SkBlitRow_opts.h source code [Skia/src/opts/SkBlitRow_opts.h]

1	/*
2	* Copyright 2015 Google Inc.
3	*
4	* Use of this source code is governed by a BSD-style license that can be
5	* found in the LICENSE file.
6	*/
7
8	#ifndef SkBlitRow_opts_DEFINED
9	#define SkBlitRow_opts_DEFINED
10
11	#include "include/private/SkColorData.h"
12	#include "include/private/SkVx.h"
13	#include "src/core/SkMSAN.h"
14	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
15	#include <immintrin.h>
16
// Blends the eight 32-bit pixels in src over the eight pixels in dst (src-over) and
// returns the eight blended pixels.  Each pixel's alpha is taken from its top byte
// (byte 3 of every 4-byte group).
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
    // Abstractly srcover is
    //     b = s + d*(1-srcA)
    //
    // In terms of unorm8 bytes, that works out to
    //     b = s + (d*(255-srcA) + 127) / 255
    //
    // But we approximate that to within a bit with
    //     b = s + (d*(255-srcA) + d) / 256
    // a.k.a
    //     b = s + (d*(256-srcA)) >> 8

    // The bottleneck of this math is the multiply, and we want to do it as
    // narrowly as possible, here getting inputs into 16-bit lanes and
    // using 16-bit multiplies.  We can do twice as many multiplies at once
    // as using naive 32-bit multiplies, and on top of that, the 16-bit multiplies
    // are themselves a couple cycles quicker.  Win-win.

    // We'll get everything in 16-bit lanes for two multiplies, one
    // handling dst red and blue, the other green and alpha.  (They're
    // conveniently 16-bits apart, you see.) We don't need the individual
    // src channels beyond alpha until the very end when we do the "s + "
    // add, and we don't even need to unpack them; the adds cannot overflow.

    // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
    const int _ = -1;   // fills a literal 0 byte.
    __m256i srcA_x2 = _mm256_shuffle_epi8(src,
            _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
                             3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
    // scale = 256 - srcA, replicated into both 16-bit halves of every pixel.
    __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256),
                                        srcA_x2);

    // Scale red and blue, leaving results in the low byte of each 16-bit lane.
    __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
    rb = _mm256_mullo_epi16(rb, scale_x2);
    rb = _mm256_srli_epi16 (rb, 8);

    // Scale green and alpha, leaving results in the high byte, masking off the low bits.
    __m256i ga = _mm256_srli_epi16(dst, 8);
    ga = _mm256_mullo_epi16(ga, scale_x2);
    ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);

    // Recombine the scaled dst channels and add the untouched src pixels.
    return _mm256_add_epi32(src, _mm256_or_si256(rb, ga));
}
61
62	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
63	#include <immintrin.h>
64
// Blends the four 32-bit pixels in src over the four pixels in dst (src-over) and
// returns the four blended pixels.  Each pixel's alpha is its top byte (src >> 24).
// The dst channels are scaled by (256 - srcA) >> 8, an approximation of
// (255 - srcA) / 255.
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
    // Multiplies every 8-bit channel of c by the per-pixel 32-bit scale (expected to
    // be in [0, 256]) and shifts the product right by 8.
    auto SkAlphaMulQ_SSE2 = [](const __m128i& c, const __m128i& scale) {
        const __m128i mask = _mm_set1_epi32(0xFF00FF);
        // Replicate the scale into both 16-bit halves of each pixel.
        __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

        // uint32_t rb = ((c & mask) * scale) >> 8
        __m128i rb = _mm_and_si128(mask, c);
        rb = _mm_mullo_epi16(rb, s);
        rb = _mm_srli_epi16(rb, 8);

        // uint32_t ag = ((c >> 8) & mask) * scale
        __m128i ag = _mm_srli_epi16(c, 8);
        ag = _mm_mullo_epi16(ag, s);

        // (rb & mask) | (ag & ~mask)
        ag = _mm_andnot_si128(mask, ag);
        return _mm_or_si128(rb, ag);
    };
    return _mm_add_epi32(src,
                         SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
                                                             _mm_srli_epi32(src, 24))));
}
87	#endif
88
89	namespace SK_OPTS_NS {
90
91	// Blend constant color over count src pixels, writing into dst.
92	inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
93	constexpr int N = `4`; // 8, 16 also reasonable choices
94	using U32 = skvx::Vec< N, uint32_t>;
95	using U16 = skvx::Vec<`4`*N, uint16_t>;
96	using U8 = skvx::Vec<`4`*N, uint8_t>;
97
98	auto kernel = [color](U32 src) {
99	unsigned invA = `255` - SkGetPackedA32(color);
100	invA += invA >> `7`;
101	SkASSERT(`0` < invA && invA < `256`); // We handle alpha == 0 or alpha == 255 specially.
102
103	// (src invA + (color << 8) + 128) >> 8*
104	// Should all fit in 16 bits.
105	U8 s = skvx::bit_pun<U8>(src),
106	a = U8 (invA);
107	U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32 (color))),
108	d = (mull(s,a) + (c << `8`) + `128`)>>`8`;
109	return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d));
110	};
111
112	while (count >= N) {
113	kernel(U32::Load(src)).store(dst);
114	src += N;
115	dst += N;
116	count -= N;
117	}
118	while (count --> `0`) {
119	dst++ = kernel(U32 {src++})[`0`];
120	}
121	}
122
123	#if defined(SK_ARM_HAS_NEON)
124
// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Widening multiply: each 16-bit lane holds the full product x[i] * y[i].
    uint16x8_t prod = vmull_u8(x, y);
    // Add the product to its own rounded >>8, then take the rounded high byte of the sum.
    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
}
131
// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.

// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
// of the same color component for 8 consecutive pixels. The result of this function follows the
// same convention.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // src.val[3] holds the eight source alphas; bitwise NOT yields 255 - alpha per lane.
    uint8x8_t nalphas = vmvn_u8(src.val[3]);
    uint8x8x4_t result;
    result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0]));
    result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1]));
    result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2]));
    result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3]));
    return result;
}
151
// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
// pixels. The return value follows the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table lookup indices: lanes 0-3 select byte 3 (the first pixel's alpha) and lanes 4-7
    // select byte 7 (the second pixel's alpha), broadcasting each alpha across its pixel.
    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
    // Bitwise NOT yields 255 - alpha per lane.
    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}
159
160	#endif
161
162	/not static/ inline
163	void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
164	SkASSERT(alpha == `0xFF`);
165	sk_msan_assert_initialized(src, src+len);
166	// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
167	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
168	while (len >= `32`) {
169	// Load 32 source pixels.
170	auto s0 = _mm256_loadu_si256((const __m256i*)(src) + `0`),
171	s1 = _mm256_loadu_si256((const __m256i*)(src) + `1`),
172	s2 = _mm256_loadu_si256((const __m256i*)(src) + `2`),
173	s3 = _mm256_loadu_si256((const __m256i*)(src) + `3`);
174
175	const auto alphaMask = _mm256_set1_epi32(`0xFF000000`);
176
177	auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
178	if (_mm256_testz_si256(ORed, alphaMask)) {
179	// All 32 source pixels are transparent. Nothing to do.
180	src += `32`;
181	dst += `32`;
182	len -= `32`;
183	continue;
184	}
185
186	auto d0 = (__m256i*)(dst) + `0`,
187	d1 = (__m256i*)(dst) + `1`,
188	d2 = (__m256i*)(dst) + `2`,
189	d3 = (__m256i*)(dst) + `3`;
190
191	auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
192	if (_mm256_testc_si256(ANDed, alphaMask)) {
193	// All 32 source pixels are opaque. SrcOver becomes Src.
194	_mm256_storeu_si256(d0, s0);
195	_mm256_storeu_si256(d1, s1);
196	_mm256_storeu_si256(d2, s2);
197	_mm256_storeu_si256(d3, s3);
198	src += `32`;
199	dst += `32`;
200	len -= `32`;
201	continue;
202	}
203
204	// TODO: This math is wrong.
205	// Do SrcOver.
206	_mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
207	_mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
208	_mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
209	_mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
210	src += `32`;
211	dst += `32`;
212	len -= `32`;
213	}
214
215	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
216	while (len >= `16`) {
217	// Load 16 source pixels.
218	auto s0 = _mm_loadu_si128((const __m128i*)(src) + `0`),
219	s1 = _mm_loadu_si128((const __m128i*)(src) + `1`),
220	s2 = _mm_loadu_si128((const __m128i*)(src) + `2`),
221	s3 = _mm_loadu_si128((const __m128i*)(src) + `3`);
222
223	const auto alphaMask = _mm_set1_epi32(`0xFF000000`);
224
225	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
226	if (_mm_testz_si128(ORed, alphaMask)) {
227	// All 16 source pixels are transparent. Nothing to do.
228	src += `16`;
229	dst += `16`;
230	len -= `16`;
231	continue;
232	}
233
234	auto d0 = (__m128i*)(dst) + `0`,
235	d1 = (__m128i*)(dst) + `1`,
236	d2 = (__m128i*)(dst) + `2`,
237	d3 = (__m128i*)(dst) + `3`;
238
239	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
240	if (_mm_testc_si128(ANDed, alphaMask)) {
241	// All 16 source pixels are opaque. SrcOver becomes Src.
242	_mm_storeu_si128(d0, s0);
243	_mm_storeu_si128(d1, s1);
244	_mm_storeu_si128(d2, s2);
245	_mm_storeu_si128(d3, s3);
246	src += `16`;
247	dst += `16`;
248	len -= `16`;
249	continue;
250	}
251
252	// TODO: This math is wrong.
253	// Do SrcOver.
254	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
255	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
256	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
257	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
258	src += `16`;
259	dst += `16`;
260	len -= `16`;
261	}
262
263	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
264	while (len >= `16`) {
265	// Load 16 source pixels.
266	auto s0 = _mm_loadu_si128((const __m128i*)(src) + `0`),
267	s1 = _mm_loadu_si128((const __m128i*)(src) + `1`),
268	s2 = _mm_loadu_si128((const __m128i*)(src) + `2`),
269	s3 = _mm_loadu_si128((const __m128i*)(src) + `3`);
270
271	const auto alphaMask = _mm_set1_epi32(`0xFF000000`);
272
273	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
274	if (`0xffff` == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
275	_mm_setzero_si128()))) {
276	// All 16 source pixels are transparent. Nothing to do.
277	src += `16`;
278	dst += `16`;
279	len -= `16`;
280	continue;
281	}
282
283	auto d0 = (__m128i*)(dst) + `0`,
284	d1 = (__m128i*)(dst) + `1`,
285	d2 = (__m128i*)(dst) + `2`,
286	d3 = (__m128i*)(dst) + `3`;
287
288	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
289	if (`0xffff` == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
290	alphaMask))) {
291	// All 16 source pixels are opaque. SrcOver becomes Src.
292	_mm_storeu_si128(d0, s0);
293	_mm_storeu_si128(d1, s1);
294	_mm_storeu_si128(d2, s2);
295	_mm_storeu_si128(d3, s3);
296	src += `16`;
297	dst += `16`;
298	len -= `16`;
299	continue;
300	}
301
302	// TODO: This math is wrong.
303	// Do SrcOver.
304	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
305	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
306	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
307	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
308
309	src += `16`;
310	dst += `16`;
311	len -= `16`;
312	}
313
314	#elif defined(SK_ARM_HAS_NEON)
315	// Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
316	// underperformed on some of the platforms under test for inputs with frequent transitions of
317	// alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
318	// revisiting the situation in the future.
319	while (len >= `8`) {
320	// Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
321	// for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
322	// pixels).
323	uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
324	src += `8`;
325	len -= `8`;
326
327	// We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
328	// are all transparent), the second when all alphas are fully set (they are all opaque).
329	uint8x8_t alphas = src_col.val[`3`];
330	uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), `0`);
331	if (alphas_u64 == `0`) {
332	// All pixels transparent.
333	dst += `8`;
334	continue;
335	}
336
337	if (~alphas_u64 == `0`) {
338	// All pixels opaque.
339	vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
340	dst += `8`;
341	continue;
342	}
343
344	uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
345	vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
346	dst += `8`;
347	}
348
349	// Deal with leftover pixels.
350	for (; len >= `2`; len -= `2`, src += `2`, dst += `2`) {
351	uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
352	uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
353	vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
354	}
355
356	if (len != `0`) {
357	uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)dst), vcreate_u8((uint64_t)src));
358	vst1_lane_u32(dst, vreinterpret_u32_u8(result), `0`);
359	}
360	return;
361	#endif
362
363	while (len-- > `0`) {
364	// This 0xFF000000 is not semantically necessary, but for compatibility
365	// with chromium:611002 we need to keep it until we figure out where
366	// the non-premultiplied src values (like 0x00FFFFFF) are coming from.
367	// TODO(mtklein): sort this out and assert src is premul here.*
368	if (*src & `0xFF000000`) {
369	dst = (src >= `0xFF000000`) ? src : SkPMSrcOver(src, *dst);
370	}
371	src++;
372	dst++;
373	}
374	}
375
376	} // SK_OPTS_NS
377
378	#endif//SkBlitRow_opts_DEFINED
379

Browse the source code of Skia/src/opts/SkBlitRow_opts.h