SkBlitRow_opts.h source code [engine/third_party/skia/src/opts/SkBlitRow_opts.h]

1	/*
2	* Copyright 2015 Google Inc.
3	*
4	* Use of this source code is governed by a BSD-style license that can be
5	* found in the LICENSE file.
6	*/
7
8	#ifndef SkBlitRow_opts_DEFINED
9	#define SkBlitRow_opts_DEFINED
10
11	#include "include/private/SkColorData.h"
12	#include "include/private/SkVx.h"
13	#include "src/core/SkMSAN.h"
14	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
15	#include <immintrin.h>
16
// SrcOver-blends the sixteen 32-bit pixels in src over the sixteen in dst
// using AVX-512 and returns the blended pixels. Alpha is read from byte 3 of
// each pixel.
static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) {
    // Detailed explanations in SkPMSrcOver_AVX2
    //    b = s + (d*(256-srcA)) >> 8

    // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
    const uint8_t _ = -1;   // fills a literal 0 byte.
    const uint8_t mask[64] = {  3, _, 3, _,  7, _, 7, _, 11, _,11, _, 15, _,15, _,
                               19, _,19, _, 23, _,23, _, 27, _,27, _, 31, _,31, _,
                               35, _,35, _, 39, _,39, _, 43, _,43, _, 47, _,47, _,
                               51, _,51, _, 55, _,55, _, 59, _,59, _, 63, _,63, _ };
    __m512i srcA_x2  = _mm512_shuffle_epi8(src, _mm512_loadu_si512(mask));
    __m512i scale_x2 = _mm512_sub_epi16(_mm512_set1_epi16(256),
                                        srcA_x2);

    // Scale red and blue, leaving results in the low byte of each 16-bit lane.
    __m512i rb = _mm512_and_si512(_mm512_set1_epi32(0x00ff00ff), dst);
    rb = _mm512_mullo_epi16(rb, scale_x2);
    rb = _mm512_srli_epi16(rb, 8);

    // Scale green and alpha, leaving results in the high byte, masking off the low bits.
    __m512i ga = _mm512_srli_epi16(dst, 8);
    ga = _mm512_mullo_epi16(ga, scale_x2);
    ga = _mm512_andnot_si512(_mm512_set1_epi32(0x00ff00ff), ga);

    return _mm512_add_epi32(src, _mm512_or_si512(rb, ga));
}
43
44	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
45	#include <immintrin.h>
46
// SrcOver-blends the eight 32-bit pixels in src over the eight in dst using
// AVX2 and returns the blended pixels. Alpha is read from byte 3 of each pixel.
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
    // Abstractly srcover is
    //     b = s + d*(1-srcA)
    //
    // In terms of unorm8 bytes, that works out to
    //     b = s + (d*(255-srcA) + 127) / 255
    //
    // But we approximate that to within a bit with
    //     b = s + (d*(255-srcA) + d) / 256
    // a.k.a
    //     b = s + (d*(256-srcA)) >> 8

    // The bottleneck of this math is the multiply, and we want to do it as
    // narrowly as possible, here getting inputs into 16-bit lanes and
    // using 16-bit multiplies.  We can do twice as many multiplies at once
    // as using naive 32-bit multiplies, and on top of that, the 16-bit multiplies
    // are themselves a couple cycles quicker.  Win-win.

    // We'll get everything in 16-bit lanes for two multiplies, one
    // handling dst red and blue, the other green and alpha.  (They're
    // conveniently 16-bits apart, you see.) We don't need the individual
    // src channels beyond alpha until the very end when we do the "s + "
    // add, and we don't even need to unpack them; the adds cannot overflow.

    // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
    const int _ = -1;   // fills a literal 0 byte.
    __m256i srcA_x2 = _mm256_shuffle_epi8(src,
            _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
                             3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
    __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256),
                                        srcA_x2);

    // Scale red and blue, leaving results in the low byte of each 16-bit lane.
    __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
    rb = _mm256_mullo_epi16(rb, scale_x2);
    rb = _mm256_srli_epi16 (rb, 8);

    // Scale green and alpha, leaving results in the high byte, masking off the low bits.
    __m256i ga = _mm256_srli_epi16(dst, 8);
    ga = _mm256_mullo_epi16(ga, scale_x2);
    ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);

    return _mm256_add_epi32(src, _mm256_or_si256(rb, ga));
}
91
92	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
93	#include <immintrin.h>
94
// SrcOver-blends the four 32-bit pixels in src over the four in dst using
// SSE2 and returns the blended pixels. Per channel this computes
//     b = s + ((d * (256 - srcA)) >> 8)
// where srcA is the top byte of each src pixel.
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
    // Multiplies every channel of the four pixels in c by the per-pixel 32-bit
    // scale (expected in [1, 256]) and shifts the products right by 8.
    auto SkAlphaMulQ_SSE2 = [](const __m128i& c, const __m128i& scale) {
        const __m128i mask = _mm_set1_epi32(0xFF00FF);
        // Duplicate the scale into both 16-bit halves of each pixel.
        __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

        // uint32_t rb = ((c & mask) * scale) >> 8
        __m128i rb = _mm_and_si128(mask, c);
        rb = _mm_mullo_epi16(rb, s);
        rb = _mm_srli_epi16(rb, 8);

        // uint32_t ag = ((c >> 8) & mask) * scale
        __m128i ag = _mm_srli_epi16(c, 8);
        ag = _mm_mullo_epi16(ag, s);

        // (rb & mask) | (ag & ~mask)
        ag = _mm_andnot_si128(mask, ag);
        return _mm_or_si128(rb, ag);
    };
    return _mm_add_epi32(src,
                         SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
                                                             _mm_srli_epi32(src, 24))));
}
117	#endif
118
119	namespace SK_OPTS_NS {
120
121	// Blend constant color over count src pixels, writing into dst.
122	inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
123	constexpr int N = `4`; // 8, 16 also reasonable choices
124	using U32 = skvx::Vec< N, uint32_t>;
125	using U16 = skvx::Vec<`4`*N, uint16_t>;
126	using U8 = skvx::Vec<`4`*N, uint8_t>;
127
128	auto kernel = [color](U32 src) {
129	unsigned invA = `255` - SkGetPackedA32(color);
130	invA += invA >> `7`;
131	SkASSERT(`0` < invA && invA < `256`); // We handle alpha == 0 or alpha == 255 specially.
132
133	// (src invA + (color << 8) + 128) >> 8*
134	// Should all fit in 16 bits.
135	U8 s = skvx::bit_pun<U8>(src),
136	a = U8 (invA);
137	U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32 (color))),
138	d = (mull(s,a) + (c << `8`) + `128`)>>`8`;
139	return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d));
140	};
141
142	while (count >= N) {
143	kernel(U32::Load(src)).store(dst);
144	src += N;
145	dst += N;
146	count -= N;
147	}
148	while (count --> `0`) {
149	dst++ = kernel(U32 {src++})[`0`];
150	}
151	}
152
153	#if defined(SK_ARM_HAS_NEON)
154
// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Widening multiply: every lane of prod is a 16-bit product.
    uint16x8_t prod = vmull_u8(x, y);
    // Add the rounded (prod >> 8) back onto prod and keep the rounded high byte of the sum.
    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
}
161
// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.

// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
// of the same color component for 8 consecutive pixels. The result of this function follows the
// same convention.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // Bitwise NOT of an 8-bit alpha is 255 - alpha.
    uint8x8_t nalphas = vmvn_u8(src.val[3]);
    uint8x8x4_t result;
    result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas,  dst.val[0]));
    result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas,  dst.val[1]));
    result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas,  dst.val[2]));
    result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas,  dst.val[3]));
    return result;
}
181
// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
// pixels. The return value follows the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table indices that copy byte 3 (alpha of the first pixel) into lanes 0-3 and byte 7
    // (alpha of the second pixel) into lanes 4-7.
    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
    // Bitwise NOT of an 8-bit alpha is 255 - alpha.
    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}
189
190	#endif
191
192	/not static/ inline
193	void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
194	SkASSERT(alpha == `0xFF`);
195	sk_msan_assert_initialized(src, src+len);
196	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
197	while (len >= `64`) {
198	// Load 64 source pixels.
199	auto s0 = _mm512_loadu_si512((const __m512i*)(src) + `0`),
200	s1 = _mm512_loadu_si512((const __m512i*)(src) + `1`),
201	s2 = _mm512_loadu_si512((const __m512i*)(src) + `2`),
202	s3 = _mm512_loadu_si512((const __m512i*)(src) + `3`);
203
204	const auto alphaMask = _mm512_set1_epi32(`0xFF000000`);
205
206	auto ORed = _mm512_or_si512(s3, _mm512_or_si512(s2, _mm512_or_si512(s1, s0)));
207	if (`0` == _mm512_cmpneq_epi8_mask(_mm512_and_si512(ORed, alphaMask),
208	_mm512_setzero_si512())) {
209	// All 64 source pixels are transparent. Nothing to do.
210	src += `64`;
211	dst += `64`;
212	len -= `64`;
213	continue;
214	}
215
216	auto d0 = (__m512i*)(dst) + `0`,
217	d1 = (__m512i*)(dst) + `1`,
218	d2 = (__m512i*)(dst) + `2`,
219	d3 = (__m512i*)(dst) + `3`;
220
221	auto ANDed = _mm512_and_si512(s3, _mm512_and_si512(s2, _mm512_and_si512(s1, s0)));
222	if (`0` == _mm512_cmpneq_epi8_mask(_mm512_and_si512(ANDed, alphaMask),
223	alphaMask)) {
224	// All 64 source pixels are opaque. SrcOver becomes Src.
225	_mm512_storeu_si512(d0, s0);
226	_mm512_storeu_si512(d1, s1);
227	_mm512_storeu_si512(d2, s2);
228	_mm512_storeu_si512(d3, s3);
229	src += `64`;
230	dst += `64`;
231	len -= `64`;
232	continue;
233	}
234
235	// TODO: This math is wrong.
236	// Do SrcOver.
237	_mm512_storeu_si512(d0, SkPMSrcOver_SKX(s0, _mm512_loadu_si512(d0)));
238	_mm512_storeu_si512(d1, SkPMSrcOver_SKX(s1, _mm512_loadu_si512(d1)));
239	_mm512_storeu_si512(d2, SkPMSrcOver_SKX(s2, _mm512_loadu_si512(d2)));
240	_mm512_storeu_si512(d3, SkPMSrcOver_SKX(s3, _mm512_loadu_si512(d3)));
241	src += `64`;
242	dst += `64`;
243	len -= `64`;
244	}
245
246	// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
247	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
248	while (len >= `32`) {
249	// Load 32 source pixels.
250	auto s0 = _mm256_loadu_si256((const __m256i*)(src) + `0`),
251	s1 = _mm256_loadu_si256((const __m256i*)(src) + `1`),
252	s2 = _mm256_loadu_si256((const __m256i*)(src) + `2`),
253	s3 = _mm256_loadu_si256((const __m256i*)(src) + `3`);
254
255	const auto alphaMask = _mm256_set1_epi32(`0xFF000000`);
256
257	auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
258	if (_mm256_testz_si256(ORed, alphaMask)) {
259	// All 32 source pixels are transparent. Nothing to do.
260	src += `32`;
261	dst += `32`;
262	len -= `32`;
263	continue;
264	}
265
266	auto d0 = (__m256i*)(dst) + `0`,
267	d1 = (__m256i*)(dst) + `1`,
268	d2 = (__m256i*)(dst) + `2`,
269	d3 = (__m256i*)(dst) + `3`;
270
271	auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
272	if (_mm256_testc_si256(ANDed, alphaMask)) {
273	// All 32 source pixels are opaque. SrcOver becomes Src.
274	_mm256_storeu_si256(d0, s0);
275	_mm256_storeu_si256(d1, s1);
276	_mm256_storeu_si256(d2, s2);
277	_mm256_storeu_si256(d3, s3);
278	src += `32`;
279	dst += `32`;
280	len -= `32`;
281	continue;
282	}
283
284	// TODO: This math is wrong.
285	// Do SrcOver.
286	_mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
287	_mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
288	_mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
289	_mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
290	src += `32`;
291	dst += `32`;
292	len -= `32`;
293	}
294
295	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
296	while (len >= `16`) {
297	// Load 16 source pixels.
298	auto s0 = _mm_loadu_si128((const __m128i*)(src) + `0`),
299	s1 = _mm_loadu_si128((const __m128i*)(src) + `1`),
300	s2 = _mm_loadu_si128((const __m128i*)(src) + `2`),
301	s3 = _mm_loadu_si128((const __m128i*)(src) + `3`);
302
303	const auto alphaMask = _mm_set1_epi32(`0xFF000000`);
304
305	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
306	if (_mm_testz_si128(ORed, alphaMask)) {
307	// All 16 source pixels are transparent. Nothing to do.
308	src += `16`;
309	dst += `16`;
310	len -= `16`;
311	continue;
312	}
313
314	auto d0 = (__m128i*)(dst) + `0`,
315	d1 = (__m128i*)(dst) + `1`,
316	d2 = (__m128i*)(dst) + `2`,
317	d3 = (__m128i*)(dst) + `3`;
318
319	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
320	if (_mm_testc_si128(ANDed, alphaMask)) {
321	// All 16 source pixels are opaque. SrcOver becomes Src.
322	_mm_storeu_si128(d0, s0);
323	_mm_storeu_si128(d1, s1);
324	_mm_storeu_si128(d2, s2);
325	_mm_storeu_si128(d3, s3);
326	src += `16`;
327	dst += `16`;
328	len -= `16`;
329	continue;
330	}
331
332	// TODO: This math is wrong.
333	// Do SrcOver.
334	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
335	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
336	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
337	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
338	src += `16`;
339	dst += `16`;
340	len -= `16`;
341	}
342
343	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
344	while (len >= `16`) {
345	// Load 16 source pixels.
346	auto s0 = _mm_loadu_si128((const __m128i*)(src) + `0`),
347	s1 = _mm_loadu_si128((const __m128i*)(src) + `1`),
348	s2 = _mm_loadu_si128((const __m128i*)(src) + `2`),
349	s3 = _mm_loadu_si128((const __m128i*)(src) + `3`);
350
351	const auto alphaMask = _mm_set1_epi32(`0xFF000000`);
352
353	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
354	if (`0xffff` == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
355	_mm_setzero_si128()))) {
356	// All 16 source pixels are transparent. Nothing to do.
357	src += `16`;
358	dst += `16`;
359	len -= `16`;
360	continue;
361	}
362
363	auto d0 = (__m128i*)(dst) + `0`,
364	d1 = (__m128i*)(dst) + `1`,
365	d2 = (__m128i*)(dst) + `2`,
366	d3 = (__m128i*)(dst) + `3`;
367
368	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
369	if (`0xffff` == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
370	alphaMask))) {
371	// All 16 source pixels are opaque. SrcOver becomes Src.
372	_mm_storeu_si128(d0, s0);
373	_mm_storeu_si128(d1, s1);
374	_mm_storeu_si128(d2, s2);
375	_mm_storeu_si128(d3, s3);
376	src += `16`;
377	dst += `16`;
378	len -= `16`;
379	continue;
380	}
381
382	// TODO: This math is wrong.
383	// Do SrcOver.
384	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
385	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
386	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
387	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
388
389	src += `16`;
390	dst += `16`;
391	len -= `16`;
392	}
393
394	#elif defined(SK_ARM_HAS_NEON)
395	// Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
396	// underperformed on some of the platforms under test for inputs with frequent transitions of
397	// alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
398	// revisiting the situation in the future.
399	while (len >= `8`) {
400	// Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
401	// for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
402	// pixels).
403	uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
404	src += `8`;
405	len -= `8`;
406
407	// We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
408	// are all transparent), the second when all alphas are fully set (they are all opaque).
409	uint8x8_t alphas = src_col.val[`3`];
410	uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), `0`);
411	if (alphas_u64 == `0`) {
412	// All pixels transparent.
413	dst += `8`;
414	continue;
415	}
416
417	if (~alphas_u64 == `0`) {
418	// All pixels opaque.
419	vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
420	dst += `8`;
421	continue;
422	}
423
424	uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
425	vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
426	dst += `8`;
427	}
428
429	// Deal with leftover pixels.
430	for (; len >= `2`; len -= `2`, src += `2`, dst += `2`) {
431	uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
432	uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
433	vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
434	}
435
436	if (len != `0`) {
437	uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)dst), vcreate_u8((uint64_t)src));
438	vst1_lane_u32(dst, vreinterpret_u32_u8(result), `0`);
439	}
440	return;
441	#endif
442
443	while (len-- > `0`) {
444	// This 0xFF000000 is not semantically necessary, but for compatibility
445	// with chromium:611002 we need to keep it until we figure out where
446	// the non-premultiplied src values (like 0x00FFFFFF) are coming from.
447	// TODO(mtklein): sort this out and assert src is premul here.*
448	if (*src & `0xFF000000`) {
449	dst = (src >= `0xFF000000`) ? src : SkPMSrcOver(src, *dst);
450	}
451	src++;
452	dst++;
453	}
454	}
455
456	} // namespace SK_OPTS_NS
457
458	#endif//SkBlitRow_opts_DEFINED
459

Browse the source code of engine/third_party/skia/src/opts/SkBlitRow_opts.h