1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkBitmapProcState_opts_DEFINED |
9 | #define SkBitmapProcState_opts_DEFINED |
10 | |
11 | #include "include/private/SkVx.h" |
12 | #include "src/core/SkBitmapProcState.h" |
13 | #include "src/core/SkMSAN.h" |
14 | |
15 | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
16 | // |
17 | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
18 | // our common baseline SSE2/NEON instruction sets, so that's |
19 | // all that lives here. |
20 | // |
21 | // The rest are scattershot at the moment but I want to get them |
22 | // all migrated to be normal code inside SkBitmapProcState.cpp. |
23 | |
24 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
25 | #include <immintrin.h> |
26 | #elif defined(SK_ARM_HAS_NEON) |
27 | #include <arm_neon.h> |
28 | #endif |
29 | |
30 | namespace SK_OPTS_NS { |
31 | |
32 | // This same basic packing scheme is used throughout the file. |
33 | template <typename U32, typename Out> |
34 | static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { |
35 | *v0 = (packed >> 18); // Integer coordinate x0 or y0. |
36 | *v1 = (packed & 0x3fff); // Integer coordinate x1 or y1. |
37 | *w = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w. |
38 | } |
39 | |
40 | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
41 | /*not static*/ inline |
42 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
43 | const uint32_t* xy, int count, uint32_t* colors) { |
44 | SkASSERT(count > 0 && colors != nullptr); |
45 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
46 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
47 | SkASSERT(s.fAlphaScale <= 256); |
48 | |
    // In a _DX variant only X varies; all samples share the same y0/y1 coordinates and weight wy.
50 | int y0, y1, wy; |
51 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
52 | |
53 | const uint32_t* row0 = s.fPixmap.addr32(0,y0); |
54 | const uint32_t* row1 = s.fPixmap.addr32(0,y1); |
55 | |
56 | auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> { |
57 | // Decode up to 8 output pixels' x-coordinates and weights. |
58 | skvx::Vec<8,uint32_t> x0,x1,wx; |
59 | decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx); |
60 | |
61 | // Splat wx to each color channel. |
62 | wx = (wx << 0) |
63 | | (wx << 8) |
64 | | (wx << 16) |
65 | | (wx << 24); |
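        // e.g. a lane holding wx = 0x0000000A becomes 0x0A0A0A0A, one copy per channel.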
66 | |
67 | auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) { |
68 | #if 1 |
69 | // Drop into AVX2 intrinsics for vpgatherdd. |
70 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>( |
71 | _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4)); |
72 | #else |
73 | // Portable version... sometimes I don't trust vpgatherdd. |
74 | return skvx::Vec<8,uint32_t>{ |
75 | ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]], |
76 | ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]], |
77 | }; |
78 | #endif |
79 | }; |
80 | |
81 | // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels. |
82 | skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1), |
83 | bl = gather(row1, x0), br = gather(row1, x1); |
84 | |
85 | #if 1 |
86 | // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code. |
87 | auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) { |
88 | __m256i l = skvx::bit_pun<__m256i>(L), |
89 | r = skvx::bit_pun<__m256i>(R), |
90 | wr = skvx::bit_pun<__m256i>(wx), |
91 | wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr); |
92 | |
93 | // Interlace l,r bytewise and line them up with their weights, then lerp. |
94 | __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r), |
95 | _mm256_unpacklo_epi8(wl,wr)); |
96 | __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r), |
97 | _mm256_unpackhi_epi8(wl,wr)); |
98 | |
99 | // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order: |
100 | // |
101 | // if l = a b c d | e f g h |
102 | // and r = A B C D | E F G H |
103 | // |
104 | // then lo = a A b B | e E f F (low half of each input) |
105 | // and hi = c C d D | g G h H (high half of each input) |
106 | // |
107 | // To get everything back in original order we need to transpose that. |
108 | __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20), |
109 | efgh = _mm256_permute2x128_si256(lo, hi, 0x31); |
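            // Now abcd = a A b B | c C d D and efgh = e E f F | g G h H,
            // i.e. every 16-bit lane is back in its original pixel order.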
110 | |
111 | return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd), |
112 | skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh)); |
113 | }; |
114 | |
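        // Lerp in Y with the usual one-multiply refactoring:
        // top*(16-wy) + bot*wy == 16*top + (bot-top)*wy.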
115 | skvx::Vec<32, uint16_t> top = lerp_x(tl, tr), |
116 | bot = lerp_x(bl, br), |
117 | sum = 16*top + (bot-top)*wy; |
118 | #else |
119 | // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply. |
120 | auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> { |
121 | return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v)); |
122 | }; |
123 | |
124 | // Sum up weighted sample pixels. The naive, redundant math would be, |
125 | // |
126 | // sum = tl * (16-wy) * (16-wx) |
127 | // + bl * ( wy) * (16-wx) |
128 | // + tr * (16-wy) * ( wx) |
129 | // + br * ( wy) * ( wx) |
130 | // |
131 | // But we refactor to eliminate a bunch of those common factors. |
132 | auto lerp = [](auto lo, auto hi, auto w) { |
133 | return 16*lo + (hi-lo)*w; |
134 | }; |
135 | skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy), |
136 | lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx)); |
137 | #endif |
138 | |
139 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
140 | sum >>= 8; |
141 | |
142 | // Scale by alpha if needed. |
        if (s.fAlphaScale < 256) {
144 | sum *= s.fAlphaScale; |
145 | sum >>= 8; |
146 | } |
147 | |
148 | // Pack back to 8-bit channels, undoing to_16x4(). |
149 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum)); |
150 | }; |
151 | |
152 | while (count >= 8) { |
153 | bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors); |
154 | xy += 8; |
155 | colors += 8; |
156 | count -= 8; |
157 | } |
158 | if (count > 0) { |
159 | __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ), |
160 | coords = _mm256_maskload_epi32((const int*)xy, active), |
161 | pixels; |
162 | |
163 | bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels); |
164 | _mm256_maskstore_epi32((int*)colors, active, pixels); |
165 | |
        sk_msan_mark_initialized(colors, colors+count,
                                 "MSAN still doesn't understand AVX2 mask loads and stores.");
168 | } |
169 | } |
170 | |
171 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
172 | |
173 | /*not static*/ inline |
174 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
175 | const uint32_t* xy, int count, uint32_t* colors) { |
176 | SkASSERT(count > 0 && colors != nullptr); |
177 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
178 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
179 | SkASSERT(s.fAlphaScale <= 256); |
180 | |
181 | // interpolate_in_x() is the crux of the SSSE3 implementation, |
182 | // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16(). |
183 | auto interpolate_in_x = [](uint32_t A0, uint32_t A1, |
184 | uint32_t B0, uint32_t B1, |
185 | __m128i interlaced_x_weights) { |
186 | // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp. |
187 | // |
188 | // It takes two arguments interlaced byte-wise: |
189 | // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...] |
190 | // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...] |
191 | // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ]. |
192 | // |
193 | // That's why we go to all this trouble to make interlaced_x_weights, |
194 | // and here we're about to interlace A0 with A1 and B0 with B1 to match. |
195 | // |
196 | // Our interlaced_x_weights are all in [0,16], and so we need not worry about |
197 | // the signedness of that input nor about the signedness of the output. |
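        //
        // For example, a channel with l = 100, r = 200 and wx = 4 uses weights
        // w = 12, W = 4: 100*12 + 200*4 = 2000 == 16 * 125, i.e. 4/16 of the way
        // from 100 to 200, still scaled up by 16 until the shift down after the y-lerp.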
198 | |
199 | __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)), |
200 | interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1)); |
201 | |
202 | return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B), |
203 | interlaced_x_weights); |
204 | }; |
205 | |
206 | // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B. |
207 | // Returns two pixels, with each color channel in a 16-bit lane of the __m128i. |
208 | auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1, |
209 | uint32_t A2, uint32_t A3, |
210 | uint32_t B0, uint32_t B1, |
211 | uint32_t B2, uint32_t B3, |
212 | __m128i interlaced_x_weights, |
213 | int wy) { |
214 | // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights. |
215 | __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights), |
216 | bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights); |
217 | |
218 | // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy |
219 | // as 16*top + (bot-top)*wy to save a multiply. |
220 | __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4), |
221 | _mm_mullo_epi16(_mm_sub_epi16(bot, top), |
222 | _mm_set1_epi16(wy))); |
223 | |
224 | // Scale down by total max weight 16x16 = 256. |
225 | px = _mm_srli_epi16(px, 8); |
226 | |
227 | // Scale by alpha if needed. |
228 | if (s.fAlphaScale < 256) { |
229 | px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8); |
230 | } |
231 | return px; |
232 | }; |
233 | |
234 | // We're in _DX mode here, so we're only varying in X. |
235 | // That means the first entry of xy is our constant pair of Y coordinates and weight in Y. |
236 | // All the other entries in xy will be pairs of X coordinates and the X weight. |
237 | int y0, y1, wy; |
238 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
239 | |
240 | auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()), |
241 | row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes()); |
242 | |
243 | while (count >= 4) { |
244 | // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels. |
245 | int x0[4], |
246 | x1[4]; |
247 | __m128i wx; |
248 | |
249 | // decode_packed_coordinates_and_weight(), 4x. |
250 | __m128i packed = _mm_loadu_si128((const __m128i*)xy); |
251 | _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18)); |
252 | _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff))); |
253 | wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15] |
254 | |
255 | // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1, |
256 | // and sixteen minus that as wl for pixels on the left at x0. |
257 | __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)), |
258 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
259 | |
260 | // We need to interlace wl and wr for _mm_maddubs_epi16(). |
261 | __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr), |
262 | interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr); |
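        // e.g. interlaced_x_weights_AB = [16-wxA,wxA]x4 then [16-wxB,wxB]x4,
        // one (left,right) byte pair per color channel of output pixels A and B.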
263 | |
264 | enum { A,B,C,D }; |
265 | |
266 | // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time |
267 | // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each. |
268 | __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]], |
269 | row1[x0[A]], row1[x1[A]], |
270 | row0[x0[B]], row0[x1[B]], |
271 | row1[x0[B]], row1[x1[B]], |
272 | interlaced_x_weights_AB, wy); |
273 | |
274 | // Once more with the other half of the x-weights for two more pixels C,D. |
275 | __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]], |
276 | row1[x0[C]], row1[x1[C]], |
277 | row0[x0[D]], row0[x1[D]], |
278 | row1[x0[D]], row1[x1[D]], |
279 | interlaced_x_weights_CD, wy); |
280 | |
        // Pack the 16-bit lanes back down to 8-bit channels and write out four pixels!
282 | _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD)); |
283 | xy += 4; |
284 | colors += 4; |
285 | count -= 4; |
286 | } |
287 | |
288 | while (count --> 0) { |
289 | // This is exactly the same flow as the count >= 4 loop above, but writing one pixel. |
290 | int x0, x1, wx; |
291 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
292 | |
293 | // As above, splat out wx four times as wr, and sixteen minus that as wl. |
294 | __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine. |
295 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
296 | |
297 | __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr); |
298 | |
299 | __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1], |
300 | row1[x0], row1[x1], |
301 | 0, 0, |
302 | 0, 0, |
303 | interlaced_x_weights, wy); |
304 | |
305 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128())); |
306 | } |
307 | } |
308 | |
309 | |
310 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
311 | |
312 | /*not static*/ inline |
313 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
314 | const uint32_t* xy, int count, uint32_t* colors) { |
315 | SkASSERT(count > 0 && colors != nullptr); |
316 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
317 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
318 | SkASSERT(s.fAlphaScale <= 256); |
319 | |
320 | int y0, y1, wy; |
321 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
322 | |
323 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
324 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
325 | |
326 | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
327 | // and another in the upper 4 16-bit lanes to line up with 16 - wy. |
328 | const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here. |
329 | _mm_set1_epi16(16-wy)); // Top pixel goes here. |
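    // e.g. with wy = 5, allY = [ 5,5,5,5, 11,11,11,11 ] across its 16-bit lanes.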
330 | |
331 | while (count --> 0) { |
332 | int x0, x1, wx; |
333 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
334 | |
335 | // Load the 4 pixels we're interpolating, in this grid: |
336 | // | tl tr | |
337 | // | bl br | |
338 | const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]), |
339 | bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]); |
340 | |
341 | // We want to calculate a sum of 4 pixels weighted in two directions: |
342 | // |
343 | // sum = tl * (16-wy) * (16-wx) |
344 | // + bl * ( wy) * (16-wx) |
345 | // + tr * (16-wy) * ( wx) |
346 | // + br * ( wy) * ( wx) |
347 | // |
348 | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
349 | // |
350 | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
351 | // to apply those y-direction weights. So we'll start on the x-direction |
352 | // first, grouping into left and right halves, lined up with allY: |
353 | // |
354 | // L = [bl, tl] |
355 | // R = [br, tr] |
356 | // |
357 | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
358 | // |
359 | // Rewriting that one more step, we can replace a multiply with a shift: |
360 | // |
361 | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
362 | // |
363 | // That's how we'll actually do this math. |
364 | |
365 | __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()), |
366 | R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128()); |
367 | |
368 | __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4), |
369 | _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx))); |
370 | |
371 | __m128i sum_in_x = _mm_mullo_epi16(inner, allY); |
372 | |
        // sum = horizontalSum( ... ): add the top-pixel half (the high 64 bits) onto
        // the bottom-pixel half; the result lands in the low four 16-bit lanes.
        __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
375 | |
376 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
377 | sum = _mm_srli_epi16(sum, 8); |
378 | |
379 | if (s.fAlphaScale < 256) { |
380 | // Scale by alpha, which is in [0,256]. |
381 | sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale)); |
382 | sum = _mm_srli_epi16(sum, 8); |
383 | } |
384 | |
385 | // Pack back into 8-bit values and store. |
386 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128())); |
387 | } |
388 | } |
389 | |
390 | #else |
391 | |
392 | // The NEON code only actually differs from the portable code in the |
393 | // filtering step after we've loaded all four pixels we want to bilerp. |
394 | |
395 | #if defined(SK_ARM_HAS_NEON) |
396 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
397 | SkPMColor a00, SkPMColor a01, |
398 | SkPMColor a10, SkPMColor a11, |
399 | SkPMColor *dst, |
400 | uint16_t scale) { |
401 | uint8x8_t vy, vconst16_8, v16_y, vres; |
402 | uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
403 | uint32x2_t va0, va1; |
404 | uint16x8_t tmp1, tmp2; |
405 | |
406 | vy = vdup_n_u8(y); // duplicate y into vy |
407 | vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
408 | v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
409 | |
410 | va0 = vdup_n_u32(a00); // duplicate a00 |
411 | va1 = vdup_n_u32(a10); // duplicate a10 |
412 | va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
413 | va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
414 | |
415 | tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
416 | tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y |
417 | |
418 | vx = vdup_n_u16(x); // duplicate x into vx |
419 | vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 |
420 | v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
421 | |
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01*(16-y) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11*   y   * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00*(16-y) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10*   y   * (16-x)
426 | |
427 | if (scale < 256) { |
428 | vscale = vdup_n_u16(scale); // duplicate scale |
429 | tmp = vshr_n_u16(tmp, 8); // shift down result by 8 |
430 | tmp = vmul_u16(tmp, vscale); // multiply result by scale |
431 | } |
432 | |
    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down by 8 and narrow to 8-bit channels
434 | vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
435 | } |
436 | #else |
437 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
438 | SkPMColor a00, SkPMColor a01, |
439 | SkPMColor a10, SkPMColor a11, |
440 | SkPMColor* dstColor, |
441 | unsigned alphaScale) { |
442 | SkASSERT((unsigned)x <= 0xF); |
443 | SkASSERT((unsigned)y <= 0xF); |
444 | SkASSERT(alphaScale <= 256); |
445 | |
446 | int xy = x * y; |
447 | const uint32_t mask = 0xFF00FF; |
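    // mask selects two of the four 8-bit channels of an 8888 pixel, each parked in the
    // low byte of its own 16-bit half, leaving 8 bits of headroom so multiplying by a
    // weight in [0,256] can't carry between channels; (a00 >> 8) & mask lines up the
    // other two channels the same way.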
448 | |
    int scale = 256 - 16*y - 16*x + xy;   // (16-x)*(16-y), the bilinear weight for a00.
450 | uint32_t lo = (a00 & mask) * scale; |
451 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
452 | |
    scale = 16*x - xy;                    // x*(16-y), the weight for a01.
454 | lo += (a01 & mask) * scale; |
455 | hi += ((a01 >> 8) & mask) * scale; |
456 | |
    scale = 16*y - xy;                    // y*(16-x), the weight for a10.
458 | lo += (a10 & mask) * scale; |
459 | hi += ((a10 >> 8) & mask) * scale; |
460 | |
    lo += (a11 & mask) * xy;              // xy == x*y, the weight for a11.
462 | hi += ((a11 >> 8) & mask) * xy; |
463 | |
464 | if (alphaScale < 256) { |
465 | lo = ((lo >> 8) & mask) * alphaScale; |
466 | hi = ((hi >> 8) & mask) * alphaScale; |
467 | } |
468 | |
469 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
470 | } |
471 | #endif |
472 | |
473 | |
474 | /*not static*/ inline |
475 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
476 | const uint32_t* xy, int count, SkPMColor* colors) { |
477 | SkASSERT(count > 0 && colors != nullptr); |
478 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
479 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
480 | SkASSERT(s.fAlphaScale <= 256); |
481 | |
482 | int y0, y1, wy; |
483 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
484 | |
485 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
486 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
487 | |
488 | while (count --> 0) { |
489 | int x0, x1, wx; |
490 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
491 | |
492 | filter_and_scale_by_alpha(wx, wy, |
493 | row0[x0], row0[x1], |
494 | row1[x0], row1[x1], |
495 | colors++, |
496 | s.fAlphaScale); |
497 | } |
498 | } |
499 | |
500 | #endif |
501 | |
502 | #if defined(SK_ARM_HAS_NEON) |
503 | /*not static*/ inline |
504 | void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s, |
505 | const uint32_t* xy, int count, SkPMColor* colors) { |
506 | SkASSERT(count > 0 && colors != nullptr); |
507 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
508 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
509 | SkASSERT(s.fAlphaScale <= 256); |
510 | |
511 | auto src = (const char*)s.fPixmap.addr(); |
512 | size_t rb = s.fPixmap.rowBytes(); |
513 | |
514 | while (count --> 0) { |
515 | int y0, y1, wy, |
516 | x0, x1, wx; |
517 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
518 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
519 | |
520 | auto row0 = (const uint32_t*)(src + y0*rb), |
521 | row1 = (const uint32_t*)(src + y1*rb); |
522 | |
523 | filter_and_scale_by_alpha(wx, wy, |
524 | row0[x0], row0[x1], |
525 | row1[x0], row1[x1], |
526 | colors++, |
527 | s.fAlphaScale); |
528 | } |
529 | } |
530 | #else |
531 | // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2. |
532 | constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&, |
533 | const uint32_t*, int, SkPMColor*) = nullptr; |
534 | #endif |
535 | |
536 | } // namespace SK_OPTS_NS |
537 | |
538 | #endif |
539 | |