/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED

#include "include/private/SkVx.h"
#include "src/core/SkBitmapProcState.h"
#include "src/core/SkMSAN.h"

// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// Only S32_alpha_D32_filter_DX exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, so that's
// all that lives here.
//
// The rest are scattershot at the moment but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

// This same basic packing scheme is used throughout the file.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
}
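
// For example, the packed value (3 << 18) | (5 << 14) | 6 == 0xD4006 decodes as
//
//     int v0, v1, w;
//     decode_packed_coordinates_and_weight(0xD4006u, &v0, &v1, &w);
//     // v0 == 3, v1 == 6, w == 5: pixel 3 gets weight 16-5 = 11, pixel 6 gets 5.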

#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        // In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        const uint32_t* row0 = s.fPixmap.addr32(0,y0);
        const uint32_t* row1 = s.fPixmap.addr32(0,y1);

        auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {
            // Decode up to 8 output pixels' x-coordinates and weights.
            skvx::Vec<8,uint32_t> x0,x1,wx;
            decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);

            // Splat wx to each color channel.
            wx = (wx <<  0)
               | (wx <<  8)
               | (wx << 16)
               | (wx << 24);
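            // (E.g. a weight of 5 becomes 0x05050505: one copy per 8-bit channel lane.)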

            auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) {
            #if 1
                // Drop into AVX2 intrinsics for vpgatherdd.
                return skvx::bit_pun<skvx::Vec<8,uint32_t>>(
                        _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4));
            #else
                // Portable version... sometimes I don't trust vpgatherdd.
                return skvx::Vec<8,uint32_t>{
                    ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]],
                    ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]],
                };
            #endif
            };

            // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels.
            skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1),
                                  bl = gather(row1, x0), br = gather(row1, x1);

        #if 1
            // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code.
            auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) {
                __m256i l = skvx::bit_pun<__m256i>(L),
                        r = skvx::bit_pun<__m256i>(R),
                       wr = skvx::bit_pun<__m256i>(wx),
                       wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr);

                // Interlace l,r bytewise and line them up with their weights, then lerp.
                __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r),
                                                  _mm256_unpacklo_epi8(wl,wr));
                __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r),
                                                  _mm256_unpackhi_epi8(wl,wr));

                // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order:
                //
                //    if l = a b c d | e f g h
                //   and r = A B C D | E F G H
                //
                // (where | marks the boundary between the two independent 128-bit halves)
                //
                //  then lo = a A b B | e E f F   (low  half of each input)
                //   and hi = c C d D | g G h H   (high half of each input)
                //
                // To get everything back in original order we need to transpose that.
                __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20),
                        efgh = _mm256_permute2x128_si256(lo, hi, 0x31);
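                // (Selector 0x20 takes the low 128-bit half of each argument and 0x31
                //  the high halves, so abcd = [lo.lo, hi.lo] and efgh = [lo.hi, hi.hi].)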

                return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd),
                                  skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh));
            };

            skvx::Vec<32, uint16_t> top = lerp_x(tl, tr),
                                    bot = lerp_x(bl, br),
                                    sum = 16*top + (bot-top)*wy;
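            // (16*top + (bot-top)*wy is the refactored top*(16-wy) + bot*wy,
            //  saving a multiply, just as in the SSE2 code below.)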
        #else
            // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.
            auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {
                return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));
            };

            // Sum up weighted sample pixels.  The naive, redundant math would be,
            //
            //   sum = tl * (16-wy) * (16-wx)
            //       + bl * (   wy) * (16-wx)
            //       + tr * (16-wy) * (   wx)
            //       + br * (   wy) * (   wx)
            //
            // But we refactor to eliminate a bunch of those common factors.
            auto lerp = [](auto lo, auto hi, auto w) {
                return 16*lo + (hi-lo)*w;
            };
            skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),
                                               lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));
        #endif

            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum >>= 8;

            // Scale by [0,256] alpha.
            sum *= s.fAlphaScale;
            sum >>= 8;

            // Pack back to 8-bit channels, undoing to_16x4().
            return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));
        };

        while (count >= 8) {
            bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);
            xy     += 8;
            colors += 8;
            count  -= 8;
        }
        if (count > 0) {
            __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),
                    coords = _mm256_maskload_epi32((const int*)xy, active),
                    pixels;

            bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);
            _mm256_maskstore_epi32((int*)colors, active, pixels);

            sk_msan_mark_initialized(colors, colors+count,
                                     "MSAN still doesn't understand AVX2 mask loads and stores.");
        }
    }

#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        // interpolate_in_x() is the crux of the SSSE3 implementation,
        // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                                   uint32_t B0, uint32_t B1,
                                   __m128i interlaced_x_weights) {
            // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
            //
            // It takes two arguments interlaced byte-wise:
            //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
            //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
            // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
            //
            // That's why we go to all this trouble to make interlaced_x_weights,
            // and here we're about to interlace A0 with A1 and B0 with B1 to match.
            //
            // Our interlaced_x_weights are all in [0,16], and so we need not worry about
            // the signedness of that input nor about the signedness of the output.

            __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                    interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

            return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                     interlaced_x_weights);
        };
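        // For example, with a channel value l = 100 on the left, r = 200 on the right,
        // and weights w = 11, W = 5, the lane holds 100*11 + 200*5 = 2100,
        // which is the lerped value 131.25 scaled by the total weight 16.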

        // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
        // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                          uint32_t A2, uint32_t A3,
                                          uint32_t B0, uint32_t B1,
                                          uint32_t B2, uint32_t B3,
                                          __m128i interlaced_x_weights,
                                          int wy) {
            // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
            __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                    bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

            // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
            // as 16*top + (bot-top)*wy to save a multiply.
            __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                       _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                       _mm_set1_epi16(wy)));

            // Scale down by total max weight 16x16 = 256.
            px = _mm_srli_epi16(px, 8);

            // Scale by alpha if needed.
            if (s.fAlphaScale < 256) {
                px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
            }
            return px;
        };

        // We're in _DX mode here, so we're only varying in X.
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
        // All the other entries in xy will be pairs of X coordinates and the X weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

        while (count >= 4) {
            // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
            int x0[4],
                x1[4];
            __m128i wx;

            // decode_packed_coordinates_and_weight(), 4x.
            __m128i packed = _mm_loadu_si128((const __m128i*)xy);
            _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
            _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
            wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]

            // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
            // and sixteen minus that as wl for pixels on the left at x0.
            __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

            // We need to interlace wl and wr for _mm_maddubs_epi16().
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

            enum { A,B,C,D };
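            // (A,B,C,D name the four output pixels' indices 0,1,2,3 into x0[] and x1[].)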

            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
            __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                                row1[x0[A]], row1[x1[A]],
                                                row0[x0[B]], row0[x1[B]],
                                                row1[x0[B]], row1[x1[B]],
                                                interlaced_x_weights_AB, wy);

            // Once more with the other half of the x-weights for two more pixels C,D.
            __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                                row1[x0[C]], row1[x1[C]],
                                                row0[x0[D]], row0[x1[D]],
                                                row1[x0[D]], row1[x1[D]],
                                                interlaced_x_weights_CD, wy);

            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
            xy     += 4;
            colors += 4;
            count  -= 4;
        }

        while (count --> 0) {
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // As above, splat out wx four times as wr, and sixteen minus that as wl.
            __m128i wr = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

            __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,
                                               0, 0,
                                               interlaced_x_weights, wy);

            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
        }
    }


#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.
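        // (E.g. with wy = 5, allY's eight 16-bit lanes are [5,5,5,5, 11,11,11,11].)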

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // Load the 4 pixels we're interpolating, in this grid:
            //    | tl  tr |
            //    | bl  br |
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

            // We want to calculate a sum of 4 pixels weighted in two directions:
            //
            //  sum = tl * (16-wy) * (16-wx)
            //      + bl * (   wy) * (16-wx)
            //      + tr * (16-wy) * (   wx)
            //      + br * (   wy) * (   wx)
            //
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
            //
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
            // to apply those y-direction weights.  So we'll start on the x-direction
            // first, grouping into left and right halves, lined up with allY:
            //
            //     L = [bl, tl]
            //     R = [br, tr]
            //
            //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
            //
            // Rewriting that one more step, we can replace a multiply with a shift:
            //
            //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
            //
            // That's how we'll actually do this math.

            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);

            // sum = horizontalSum( ... )
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
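            // (Shifting the whole register right 8 bytes drops the top pixel's four
            //  lanes onto the bottom pixel's, so the low four 16-bit lanes now hold
            //  the complete weighted sum for each channel.)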

            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
            sum = _mm_srli_epi16(sum, 8);

            if (s.fAlphaScale < 256) {
                // Scale by alpha, which is in [0,256].
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                sum = _mm_srli_epi16(sum, 8);
            }

            // Pack back into 8-bit values and store.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
        }
    }

#else

    // The NEON code only actually differs from the portable code in the
    // filtering step after we've loaded all four pixels we want to bilerp.

    #if defined(SK_ARM_HAS_NEON)
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor *dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;

            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11

            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

            if (scale < 256) {
                vscale = vdup_n_u16(scale);   // duplicate scale
                tmp = vshr_n_u16(tmp, 8);     // shift down result by 8
                tmp = vmul_u16(tmp, vscale);  // multiply result by scale
            }

            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store result
        }
    #else
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dstColor,
                                              unsigned alphaScale) {
            SkASSERT((unsigned)x <= 0xF);
            SkASSERT((unsigned)y <= 0xF);
            SkASSERT(alphaScale <= 256);

            int xy = x * y;
            const uint32_t mask = 0xFF00FF;

            int scale = 256 - 16*y - 16*x + xy;
            uint32_t lo = (a00 & mask) * scale;
            uint32_t hi = ((a00 >> 8) & mask) * scale;

            scale = 16*x - xy;
            lo += (a01 & mask) * scale;
            hi += ((a01 >> 8) & mask) * scale;

            scale = 16*y - xy;
            lo += (a10 & mask) * scale;
            hi += ((a10 >> 8) & mask) * scale;

            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;

            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }

            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
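
        // The scales above are the four bilinear weights, each out of 256:
        //   a00 gets 256 - 16y - 16x + xy == (16-x)*(16-y),
        //   a01 gets 16x - xy             ==     x *(16-y),
        //   a10 gets 16y - xy             == (16-x)* y,
        //   a11 gets                  xy  ==     x * y,
        // summing to 256.  The 0xFF00FF mask splits the pixel into R|B and A|G
        // pairs, each channel with 8 bits of headroom for these [0,256] weights.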
    #endif


    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);

        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }

#endif

#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline
    void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                                   const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);

        auto src = (const char*)s.fPixmap.addr();
        size_t rb = s.fPixmap.rowBytes();

        while (count --> 0) {
            int y0, y1, wy,
                x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            auto row0 = (const uint32_t*)(src + y0*rb),
                 row1 = (const uint32_t*)(src + y1*rb);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }
#else
    // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.
    constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                       const uint32_t*, int, SkPMColor*) = nullptr;
#endif

}  // namespace SK_OPTS_NS

#endif