/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED

#include "include/private/SkVx.h"
#include "src/core/SkBitmapProcState.h"
#include "src/core/SkMSAN.h"

// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// Only S32_alpha_D32_filter_DX exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, so that's
// all that lives here.
//
// The rest are scattershot at the moment but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

// This same basic packing scheme is used throughout the file.
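// For instance, reading the shifts in the helper below (v0 in bits [31:18],
// w in bits [17:14], v1 in bits [13:0]), a packed value of
//     (3 << 18) | (5 << 14) | 7
// decodes to v0 = 3, v1 = 7, w = 5, i.e. blend 11/16 of sample 3 with 5/16 of sample 7.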
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
}

#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        // In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        const uint32_t* row0 = s.fPixmap.addr32(0,y0);
        const uint32_t* row1 = s.fPixmap.addr32(0,y1);

        auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {
            // Decode up to 8 output pixels' x-coordinates and weights.
            skvx::Vec<8,uint32_t> x0,x1,wx;
            decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);

            // Splat wx to each color channel.
            wx = (wx <<  0)
               | (wx <<  8)
               | (wx << 16)
               | (wx << 24);
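            // E.g. a lane holding wx = 0x00000007 becomes 0x07070707, one copy of the
            // 4-bit weight per 8-bit channel (wx is at most 15, so each copy stays in its byte).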

            auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) {
            #if 1
                // Drop into AVX2 intrinsics for vpgatherdd.
                return skvx::bit_pun<skvx::Vec<8,uint32_t>>(
                        _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4));
            #else
                // Portable version... sometimes I don't trust vpgatherdd.
                return skvx::Vec<8,uint32_t>{
                    ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]],
                    ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]],
                };
            #endif
            };

            // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels.
            skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1),
                                  bl = gather(row1, x0), br = gather(row1, x1);

        #if 1
            // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code.
            auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) {
                __m256i l = skvx::bit_pun<__m256i>(L),
                        r = skvx::bit_pun<__m256i>(R),
                       wr = skvx::bit_pun<__m256i>(wx),
                       wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr);

                // Interlace l,r bytewise and line them up with their weights, then lerp.
                __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r),
                                                  _mm256_unpacklo_epi8(wl,wr));
                __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r),
                                                  _mm256_unpackhi_epi8(wl,wr));

                // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order:
                //
                //    if   l = a b c d | e f g h
                //   and   r = A B C D | E F G H
                //
                //  then  lo = a A b B | e E f F   (low  half of each input)
                //   and  hi = c C d D | g G h H   (high half of each input)
                //
                // To get everything back in original order we need to transpose that.
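                // (_mm256_permute2x128_si256() picks one 128-bit half from each argument:
                //  0x20 = (low of lo, low of hi), 0x31 = (high of lo, high of hi),
                //  which is exactly that transpose.)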
                __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20),
                        efgh = _mm256_permute2x128_si256(lo, hi, 0x31);

                return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd),
                                  skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh));
            };

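            // As with lerp_x, top*(16-wy) + bot*wy is refactored to 16*top + (bot-top)*wy
            // to save a multiply.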
            skvx::Vec<32, uint16_t> top = lerp_x(tl, tr),
                                    bot = lerp_x(bl, br),
                                    sum = 16*top + (bot-top)*wy;
        #else
            // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.
            auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {
                return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));
            };

            // Sum up weighted sample pixels.  The naive, redundant math would be,
            //
            //    sum = tl * (16-wy) * (16-wx)
            //        + bl * (   wy) * (16-wx)
            //        + tr * (16-wy) * (   wx)
            //        + br * (   wy) * (   wx)
            //
            // But we refactor to eliminate a bunch of those common factors.
            auto lerp = [](auto lo, auto hi, auto w) {
                return 16*lo + (hi-lo)*w;
            };
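            // (lerp() is the same identity again: 16*lo + (hi-lo)*w == lo*(16-w) + hi*w,
            //  e.g. lo=100, hi=200, w=4: 1600 + 100*4 == 100*12 + 200*4 == 2000.)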
            skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),
                                               lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));
        #endif

            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum >>= 8;

            // Scale by [0,256] alpha.
            sum *= s.fAlphaScale;
            sum >>= 8;

            // Pack back to 8-bit channels, undoing to_16x4().
            return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));
        };

        while (count >= 8) {
            bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);
            xy     += 8;
            colors += 8;
            count  -= 8;
        }
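        // Handle the remaining 1..7 pixels with masked loads and stores: comparing each lane
        // index against count leaves all-ones in the lanes we want (index < count) and zero
        // elsewhere, so the mask load/store only touch those lanes.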
        if (count > 0) {
            __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),
                    coords = _mm256_maskload_epi32((const int*)xy, active),
                    pixels;

            bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);
            _mm256_maskstore_epi32((int*)colors, active, pixels);

            sk_msan_mark_initialized(colors, colors+count,
                                     "MSAN still doesn't understand AVX2 mask loads and stores.");
        }
    }

#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        // interpolate_in_x() is the crux of the SSSE3 implementation,
        // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                                   uint32_t B0, uint32_t B1,
                                   __m128i interlaced_x_weights) {
            // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
            //
            // It takes two arguments interlaced byte-wise:
            //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
            //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
            // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
            //
            // That's why we go to all this trouble to make interlaced_x_weights,
            // and here we're about to interlace A0 with A1 and B0 with B1 to match.
            //
            // Our interlaced_x_weights are all in [0,16], and so we need not worry about
            // the signedness of that input nor about the signedness of the output.

            __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                    interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

            return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                     interlaced_x_weights);
        };

        // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
        // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                          uint32_t A2, uint32_t A3,
                                          uint32_t B0, uint32_t B1,
                                          uint32_t B2, uint32_t B3,
                                          __m128i interlaced_x_weights,
                                          int wy) {
            // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
            __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                    bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

            // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
            // as 16*top + (bot-top)*wy to save a multiply.
            __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                       _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                       _mm_set1_epi16(wy)));

            // Scale down by total max weight 16x16 = 256.
            px = _mm_srli_epi16(px, 8);

            // Scale by alpha if needed.
            if (s.fAlphaScale < 256) {
                px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
            }
            return px;
        };

        // We're in _DX mode here, so we're only varying in X.
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
        // All the other entries in xy will be pairs of X coordinates and the X weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
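        // (These are just pointers to the start of the two source rows being blended,
        //  the same thing s.fPixmap.addr32(0, y0/y1) computes in the AVX2 path above.)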

        while (count >= 4) {
            // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
            int x0[4],
                x1[4];
            __m128i wx;

            // decode_packed_coordinates_and_weight(), 4x.
            __m128i packed = _mm_loadu_si128((const __m128i*)xy);
            _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
            _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
            wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]

            // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
            // and sixteen minus that as wl for pixels on the left at x0.
            __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
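            // (The shuffle grabs byte 0 of each 32-bit weight lane four times, e.g. wx lanes
            //  [5,9,2,14] become wr bytes [5,5,5,5, 9,9,9,9, 2,2,2,2, 14,14,14,14].)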

            // We need to interlace wl and wr for _mm_maddubs_epi16().
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

            enum { A,B,C,D };

            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
            __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                                row1[x0[A]], row1[x1[A]],
                                                row0[x0[B]], row0[x1[B]],
                                                row1[x0[B]], row1[x1[B]],
                                                interlaced_x_weights_AB, wy);

            // Once more with the other half of the x-weights for two more pixels C,D.
            __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                                row1[x0[C]], row1[x1[C]],
                                                row0[x0[D]], row0[x1[D]],
                                                row1[x0[D]], row1[x1[D]],
                                                interlaced_x_weights_CD, wy);
            // Pack back together to 8-bit lanes and write out four pixels!
            // (Alpha scaling already happened inside interpolate_in_x_and_y().)
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
            xy     += 4;
            colors += 4;
            count  -= 4;
        }

        while (count --> 0) {
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // As above, splat out wx four times as wr, and sixteen minus that as wl.
            __m128i wr = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

            __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,
                                               0, 0,
                                               interlaced_x_weights, wy);

            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
        }
    }


#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // Load the 4 pixels we're interpolating, in this grid:
            //    | tl  tr |
            //    | bl  br |
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

            // We want to calculate a sum of 4 pixels weighted in two directions:
            //
            //    sum = tl * (16-wy) * (16-wx)
            //        + bl * (   wy) * (16-wx)
            //        + tr * (16-wy) * (   wx)
            //        + br * (   wy) * (   wx)
            //
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
            //
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
            // to apply those y-direction weights.  So we'll start on the x-direction
            // first, grouping into left and right halves, lined up with allY:
            //
            //     L = [bl, tl]
            //     R = [br, tr]
            //
            //    sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
            //
            // Rewriting that one more step, we can replace a multiply with a shift:
            //
            //    sum = horizontalSum( allY * (16*L + (R-L)*wx) )
            //
            // That's how we'll actually do this math.

            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
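            // (After those unpacks, L's low four 16-bit lanes hold bl's channels and its
            //  high four hold tl's, lining up with allY = [wy x4 | 16-wy x4]; same for R.)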

            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);

            // sum = horizontalSum( ... )
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
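            // (_mm_srli_si128 slides the register down by 8 bytes, so the top-pixel lanes
            //  land on the bottom-pixel lanes; the result we want is in the low 4 lanes.)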

            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum = _mm_srli_epi16(sum, 8);

            if (s.fAlphaScale < 256) {
                // Scale by alpha, which is in [0,256].
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                sum = _mm_srli_epi16(sum, 8);
            }

            // Pack back into 8-bit values and store.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
        }
    }

#else

    // The NEON code only actually differs from the portable code in the
    // filtering step after we've loaded all four pixels we want to bilerp.

    #if defined(SK_ARM_HAS_NEON)
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor *dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;

            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11

            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
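            // tmp now holds, per channel, the full bilinear sum
            //   a00*(16-x)*(16-y) + a01*x*(16-y) + a10*(16-x)*y + a11*x*y,
            // which is at most 255*256 and still fits in a 16-bit lane.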

            if (scale < 256) {
                vscale = vdup_n_u16(scale);        // duplicate scale
                tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
                tmp = vmul_u16(tmp, vscale);       // multiply result by scale
            }

            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store result
        }
    #else
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dstColor,
                                              unsigned alphaScale) {
            SkASSERT((unsigned)x <= 0xF);
            SkASSERT((unsigned)y <= 0xF);
            SkASSERT(alphaScale <= 256);

            int xy = x * y;
            const uint32_t mask = 0xFF00FF;

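            // The scales below are the expanded forms of the four bilinear weights:
            //   (16-x)*(16-y) = 256 - 16*y - 16*x + xy   (a00)
            //        x*(16-y) = 16*x - xy                (a01)
            //        (16-x)*y = 16*y - xy                (a10)
            //             x*y = xy                       (a11)
            // The 0xFF00FF mask splits each pixel into two channel pairs so one 32-bit
            // multiply weights two channels at once; each channel's weighted sum stays
            // below 2^16 because the weights add up to 256.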
            int scale = 256 - 16*y - 16*x + xy;
            uint32_t lo = (a00 & mask) * scale;
            uint32_t hi = ((a00 >> 8) & mask) * scale;

            scale = 16*x - xy;
            lo += (a01 & mask) * scale;
            hi += ((a01 >> 8) & mask) * scale;

            scale = 16*y - xy;
            lo += (a10 & mask) * scale;
            hi += ((a10 >> 8) & mask) * scale;

            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;

            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }

            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
    #endif


    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }

#endif

#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline
    void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                                   const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);

        auto src = (const char*)s.fPixmap.addr();
        size_t rb = s.fPixmap.rowBytes();

        while (count --> 0) {
            int y0, y1, wy,
                x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            auto row0 = (const uint32_t*)(src + y0*rb),
                 row1 = (const uint32_t*)(src + y1*rb);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }
#else
    // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.
    constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                       const uint32_t*, int, SkPMColor*) = nullptr;
#endif

} // namespace SK_OPTS_NS

#endif