SkBlitRow_D32.cpp source code [Skia/src/core/SkBlitRow_D32.cpp]

1	/*
2	* Copyright 2011 Google Inc.
3	*
4	* Use of this source code is governed by a BSD-style license that can be
5	* found in the LICENSE file.
6	*/
7
8	#include "include/private/SkColorData.h"
9	#include "src/core/SkBlitRow.h"
10	#include "src/core/SkOpts.h"
11	#include "src/core/SkUtils.h"
12
13	// Everyone agrees memcpy() is the best way to do this.
14	static void blit_row_s32_opaque(SkPMColor* dst,
15	const SkPMColor* src,
16	int count,
17	U8CPU alpha) {
18	SkASSERT(`255` == alpha);
19	memcpy(dst, src, count * sizeof(SkPMColor));
20	}
21
22	// We have SSE2, NEON, and portable implementations of
23	// blit_row_s32_blend() and blit_row_s32a_blend().
24
25	// TODO(mtklein): can we do better in NEON than 2 pixels at a time?
26
27	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
28	#include <emmintrin.h>
29
30	static inline __m128i SkPMLerp_SSE2(const __m128i& src,
31	const __m128i& dst,
32	const unsigned src_scale) {
33	// Computes dst + (((src - dst)src_scale)>>8)*
34	const __m128i mask = _mm_set1_epi32(`0x00FF00FF`);
35
36	// Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
37	__m128i src_rb = _mm_and_si128(mask, src);
38	__m128i src_ag = _mm_srli_epi16(src, `8`);
39	__m128i dst_rb = _mm_and_si128(mask, dst);
40	__m128i dst_ag = _mm_srli_epi16(dst, `8`);
41
42	// Compute scaled differences.
43	__m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
44	__m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
45	__m128i s = _mm_set1_epi16(src_scale);
46	diff_rb = _mm_mullo_epi16(diff_rb, s);
47	diff_ag = _mm_mullo_epi16(diff_ag, s);
48
49	// Pack the differences back together.
50	diff_rb = _mm_srli_epi16(diff_rb, `8`);
51	diff_ag = _mm_andnot_si128(mask, diff_ag);
52	__m128i diff = _mm_or_si128(diff_rb, diff_ag);
53
54	// Add difference to destination.
55	return _mm_add_epi8(dst, diff);
56	}
57
58
59	static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
60	SkASSERT(alpha <= `255`);
61
62	auto src4 = (const __m128i*)src;
63	auto dst4 = ( __m128i*)dst;
64
65	while (count >= `4`) {
66	_mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
67	_mm_loadu_si128(dst4),
68	SkAlpha255To256(alpha)));
69	src4++;
70	dst4++;
71	count -= `4`;
72	}
73
74	src = (const SkPMColor*)src4;
75	dst = ( SkPMColor*)dst4;
76
77	while (count --> `0`) {
78	dst = SkPMLerp(src, *dst, SkAlpha255To256(alpha));
79	src++;
80	dst++;
81	}
82	}
83
84	static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
85	const __m128i& dst,
86	const unsigned aa) {
87	unsigned alpha = SkAlpha255To256(aa);
88	__m128i src_scale = _mm_set1_epi16(alpha);
89	// SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
90	__m128i dst_scale = _mm_srli_epi32(src, `24`);
91	// High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
92	dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
93	dst_scale = _mm_sub_epi32(_mm_set1_epi32(`0xFFFF`), dst_scale);
94	dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, `8`));
95	dst_scale = _mm_srli_epi32(dst_scale, `8`);
96	// Duplicate scales into 2x16-bit pattern per pixel.
97	dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(`2`, `2`, `0`, `0`));
98	dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(`2`, `2`, `0`, `0`));
99
100	const __m128i mask = _mm_set1_epi32(`0x00FF00FF`);
101
102	// Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
103	__m128i src_rb = _mm_and_si128(mask, src);
104	__m128i src_ag = _mm_srli_epi16(src, `8`);
105	__m128i dst_rb = _mm_and_si128(mask, dst);
106	__m128i dst_ag = _mm_srli_epi16(dst, `8`);
107
108	// Scale them.
109	src_rb = _mm_mullo_epi16(src_rb, src_scale);
110	src_ag = _mm_mullo_epi16(src_ag, src_scale);
111	dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
112	dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
113
114	// Add the scaled source and destination.
115	dst_rb = _mm_add_epi16(src_rb, dst_rb);
116	dst_ag = _mm_add_epi16(src_ag, dst_ag);
117
118	// Unsplay the halves back together.
119	dst_rb = _mm_srli_epi16(dst_rb, `8`);
120	dst_ag = _mm_andnot_si128(mask, dst_ag);
121	return _mm_or_si128(dst_rb, dst_ag);
122	}
123
124	static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
125	SkASSERT(alpha <= `255`);
126
127	auto src4 = (const __m128i*)src;
128	auto dst4 = ( __m128i*)dst;
129
130	while (count >= `4`) {
131	_mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
132	_mm_loadu_si128(dst4),
133	alpha));
134	src4++;
135	dst4++;
136	count -= `4`;
137	}
138
139	src = (const SkPMColor*)src4;
140	dst = ( SkPMColor*)dst4;
141
142	while (count --> `0`) {
143	dst = SkBlendARGB32(src, *dst, alpha);
144	src++;
145	dst++;
146	}
147	}
148
149	#elif defined(SK_ARM_HAS_NEON)
150	#include <arm_neon.h>
151
152	static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
153	SkASSERT(alpha <= `255`);
154
155	uint16_t src_scale = SkAlpha255To256(alpha);
156	uint16_t dst_scale = `256` - src_scale;
157
158	while (count >= `2`) {
159	uint8x8_t vsrc, vdst, vres;
160	uint16x8_t vsrc_wide, vdst_wide;
161
162	vsrc = vreinterpret_u8_u32(vld1_u32(src));
163	vdst = vreinterpret_u8_u32(vld1_u32(dst));
164
165	vsrc_wide = vmovl_u8(vsrc);
166	vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
167
168	vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
169
170	vdst_wide += vsrc_wide;
171	vres = vshrn_n_u16(vdst_wide, `8`);
172
173	vst1_u32(dst, vreinterpret_u32_u8(vres));
174
175	src += `2`;
176	dst += `2`;
177	count -= `2`;
178	}
179
180	if (count == `1`) {
181	uint8x8_t vsrc = vdup_n_u8(`0`), vdst = vdup_n_u8(`0`), vres;
182	uint16x8_t vsrc_wide, vdst_wide;
183
184	vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), `0`));
185	vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), `0`));
186
187	vsrc_wide = vmovl_u8(vsrc);
188	vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
189	vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
190	vdst_wide += vsrc_wide;
191	vres = vshrn_n_u16(vdst_wide, `8`);
192
193	vst1_lane_u32(dst, vreinterpret_u32_u8(vres), `0`);
194	}
195	}
196
197	static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
198	SkASSERT(alpha < `255`);
199
200	unsigned alpha256 = SkAlpha255To256(alpha);
201
202	if (count & `1`) {
203	uint8x8_t vsrc = vdup_n_u8(`0`), vdst = vdup_n_u8(`0`), vres;
204	uint16x8_t vdst_wide, vsrc_wide;
205	unsigned dst_scale;
206
207	vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), `0`));
208	vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), `0`));
209
210	dst_scale = vget_lane_u8(vsrc, `3`);
211	dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
212
213	vsrc_wide = vmovl_u8(vsrc);
214	vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
215
216	vdst_wide = vmovl_u8(vdst);
217	vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
218
219	vdst_wide += vsrc_wide;
220	vres = vshrn_n_u16(vdst_wide, `8`);
221
222	vst1_lane_u32(dst, vreinterpret_u32_u8(vres), `0`);
223	dst++;
224	src++;
225	count--;
226	}
227
228	uint8x8_t alpha_mask;
229	static const uint8_t alpha_mask_setup[] = {`3`,`3`,`3`,`3`,`7`,`7`,`7`,`7`};
230	alpha_mask = vld1_u8(alpha_mask_setup);
231
232	while (count) {
233
234	uint8x8_t vsrc, vdst, vres, vsrc_alphas;
235	uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
236
237	__builtin_prefetch(src+`32`);
238	__builtin_prefetch(dst+`32`);
239
240	vsrc = vreinterpret_u8_u32(vld1_u32(src));
241	vdst = vreinterpret_u8_u32(vld1_u32(dst));
242
243	vsrc_scale = vdupq_n_u16(alpha256);
244
245	vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
246	vdst_scale = vmovl_u8(vsrc_alphas);
247	// Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
248	// A 16-bit lane would overflow if we used 0xFFFF here,
249	// so use an approximation with 0xFF00 that is off by 1,
250	// and add back 1 after to get the correct value.
251	// This is valid if alpha256 <= 255.
252	vdst_scale = vmlsq_u16(vdupq_n_u16(`0xFF00`), vdst_scale, vsrc_scale);
253	vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, `8`);
254	vdst_scale = vsraq_n_u16(vdupq_n_u16(`1`), vdst_scale, `8`);
255
256	vsrc_wide = vmovl_u8(vsrc);
257	vsrc_wide *= vsrc_scale;
258
259	vdst_wide = vmovl_u8(vdst);
260	vdst_wide *= vdst_scale;
261
262	vdst_wide += vsrc_wide;
263	vres = vshrn_n_u16(vdst_wide, `8`);
264
265	vst1_u32(dst, vreinterpret_u32_u8(vres));
266
267	src += `2`;
268	dst += `2`;
269	count -= `2`;
270	}
271	}
272
273	#else
274	static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
275	SkASSERT(alpha <= `255`);
276	while (count --> `0`) {
277	dst = SkPMLerp(src, *dst, SkAlpha255To256(alpha));
278	src++;
279	dst++;
280	}
281	}
282
283	static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
284	SkASSERT(alpha <= `255`);
285	while (count --> `0`) {
286	dst = SkBlendARGB32(src, *dst, alpha);
287	src++;
288	dst++;
289	}
290	}
291	#endif
292
293	SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
294	static const SkBlitRow::Proc32 kProcs[] = {
295	blit_row_s32_opaque,
296	blit_row_s32_blend,
297	nullptr, // blit_row_s32a_opaque is in SkOpts
298	blit_row_s32a_blend
299	};
300
301	SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
302	flags &= SK_ARRAY_COUNT(kProcs) - `1`; // just to be safe
303
304	return flags == `2` ? SkOpts::blit_row_s32a_opaque
305	: kProcs[flags];
306	}
307
308	void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
309	switch (SkGetPackedA32(color)) {
310	case `0`: memmove(dst, src, count * sizeof(SkPMColor)); return;
311	case `255`: sk_memset32(dst, color, count); return;
312	}
313	return SkOpts::blit_row_color32(dst, src, count, color);
314	}
315

Browse the source code of Skia/src/core/SkBlitRow_D32.cpp