1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkBitmapProcState_opts_DEFINED |
9 | #define SkBitmapProcState_opts_DEFINED |
10 | |
11 | #include "include/private/SkVx.h" |
12 | #include "src/core/SkBitmapProcState.h" |
13 | #include "src/core/SkMSAN.h" |
14 | |
15 | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
16 | // |
17 | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
18 | // our common baseline SSE2/NEON instruction sets, so that's |
19 | // all that lives here. |
20 | // |
21 | // The rest are scattershot at the moment but I want to get them |
22 | // all migrated to be normal code inside SkBitmapProcState.cpp. |
23 | |
24 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
25 | #include <immintrin.h> |
26 | #elif defined(SK_ARM_HAS_NEON) |
27 | #include <arm_neon.h> |
28 | #endif |
29 | |
30 | namespace SK_OPTS_NS { |
31 | |
32 | // This same basic packing scheme is used throughout the file. |
33 | template <typename U32, typename Out> |
34 | static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { |
35 | *v0 = (packed >> 18); // Integer coordinate x0 or y0. |
36 | *v1 = (packed & 0x3fff); // Integer coordinate x1 or y1. |
37 | *w = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w. |
38 | } |
39 | |
40 | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
41 | /*not static*/ inline |
42 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
43 | const uint32_t* xy, int count, uint32_t* colors) { |
44 | SkASSERT(count > 0 && colors != nullptr); |
45 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
46 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
47 | SkASSERT(s.fAlphaScale <= 256); |
48 | |
    // In a _DX variant only X varies; all samples share the same y0/y1 coordinates and weight wy.
50 | int y0, y1, wy; |
51 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
52 | |
53 | const uint32_t* row0 = s.fPixmap.addr32(0,y0); |
54 | const uint32_t* row1 = s.fPixmap.addr32(0,y1); |
55 | |
56 | auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> { |
57 | // Decode up to 8 output pixels' x-coordinates and weights. |
58 | skvx::Vec<8,uint32_t> x0,x1,wx; |
59 | decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx); |
60 | |
61 | // Splat wx to each color channel. |
62 | wx = (wx << 0) |
63 | | (wx << 8) |
64 | | (wx << 16) |
65 | | (wx << 24); |
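        // e.g. a lane holding wx = 0x0000000A becomes 0x0A0A0A0A, one copy per channel.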
66 | |
67 | auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) { |
68 | #if 1 |
69 | // Drop into AVX2 intrinsics for vpgatherdd. |
70 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>( |
71 | _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4)); |
72 | #else |
73 | // Portable version... sometimes I don't trust vpgatherdd. |
74 | return skvx::Vec<8,uint32_t>{ |
75 | ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]], |
76 | ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]], |
77 | }; |
78 | #endif |
79 | }; |
80 | |
81 | // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels. |
82 | skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1), |
83 | bl = gather(row1, x0), br = gather(row1, x1); |
84 | |
85 | #if 1 |
86 | // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code. |
87 | auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) { |
88 | __m256i l = skvx::bit_pun<__m256i>(L), |
89 | r = skvx::bit_pun<__m256i>(R), |
90 | wr = skvx::bit_pun<__m256i>(wx), |
91 | wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr); |
92 | |
93 | // Interlace l,r bytewise and line them up with their weights, then lerp. |
94 | __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r), |
95 | _mm256_unpacklo_epi8(wl,wr)); |
96 | __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r), |
97 | _mm256_unpackhi_epi8(wl,wr)); |
98 | |
99 | // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order: |
100 | // |
101 | // if l = a b c d | e f g h |
102 | // and r = A B C D | E F G H |
103 | // |
104 | // then lo = a A b B | e E f F (low half of each input) |
105 | // and hi = c C d D | g G h H (high half of each input) |
106 | // |
107 | // To get everything back in original order we need to transpose that. |
108 | __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20), |
109 | efgh = _mm256_permute2x128_si256(lo, hi, 0x31); |
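            // Now abcd = a A b B | c C d D and efgh = e E f F | g G h H,
            // i.e. every 16-bit lane is back in its original pixel order.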
110 | |
111 | return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd), |
112 | skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh)); |
113 | }; |
114 | |
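        // Lerp in Y with the usual one-multiply refactoring:
        // top*(16-wy) + bot*wy == 16*top + (bot-top)*wy.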
115 | skvx::Vec<32, uint16_t> top = lerp_x(tl, tr), |
116 | bot = lerp_x(bl, br), |
117 | sum = 16*top + (bot-top)*wy; |
118 | #else |
119 | // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply. |
120 | auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> { |
121 | return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v)); |
122 | }; |
123 | |
124 | // Sum up weighted sample pixels. The naive, redundant math would be, |
125 | // |
126 | // sum = tl * (16-wy) * (16-wx) |
127 | // + bl * ( wy) * (16-wx) |
128 | // + tr * (16-wy) * ( wx) |
129 | // + br * ( wy) * ( wx) |
130 | // |
131 | // But we refactor to eliminate a bunch of those common factors. |
132 | auto lerp = [](auto lo, auto hi, auto w) { |
133 | return 16*lo + (hi-lo)*w; |
134 | }; |
135 | skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy), |
136 | lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx)); |
137 | #endif |
138 | |
139 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
140 | sum >>= 8; |
141 | |
142 | // Scale by alpha if needed. |
        if (s.fAlphaScale < 256) {
144 | sum *= s.fAlphaScale; |
145 | sum >>= 8; |
146 | } |
147 | |
148 | // Pack back to 8-bit channels, undoing to_16x4(). |
149 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum)); |
150 | }; |
151 | |
152 | while (count >= 8) { |
153 | bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors); |
154 | xy += 8; |
155 | colors += 8; |
156 | count -= 8; |
157 | } |
158 | if (count > 0) { |
159 | __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ), |
160 | coords = _mm256_maskload_epi32((const int*)xy, active), |
161 | pixels; |
162 | |
163 | bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels); |
164 | _mm256_maskstore_epi32((int*)colors, active, pixels); |
165 | |
        sk_msan_mark_initialized(colors, colors+count,
                                 "MSAN still doesn't understand AVX2 mask loads and stores.");
168 | } |
169 | } |
170 | |
171 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
172 | |
173 | /*not static*/ inline |
174 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
175 | const uint32_t* xy, int count, uint32_t* colors) { |
176 | SkASSERT(count > 0 && colors != nullptr); |
177 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
178 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
179 | SkASSERT(s.fAlphaScale <= 256); |
180 | |
181 | // interpolate_in_x() is the crux of the SSSE3 implementation, |
182 | // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16(). |
183 | auto interpolate_in_x = [](uint32_t A0, uint32_t A1, |
184 | uint32_t B0, uint32_t B1, |
185 | __m128i interlaced_x_weights) { |
186 | // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp. |
187 | // |
188 | // It takes two arguments interlaced byte-wise: |
189 | // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...] |
190 | // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...] |
191 | // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ]. |
192 | // |
193 | // That's why we go to all this trouble to make interlaced_x_weights, |
194 | // and here we're about to interlace A0 with A1 and B0 with B1 to match. |
195 | // |
196 | // Our interlaced_x_weights are all in [0,16], and so we need not worry about |
197 | // the signedness of that input nor about the signedness of the output. |
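        //
        // For example, a channel with l = 100, r = 200 and wx = 4 uses weights
        // w = 12, W = 4: 100*12 + 200*4 = 2000 == 16 * 125, i.e. 4/16 of the way
        // from 100 to 200, still scaled up by 16 until the shift down after the y-lerp.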
198 | |
199 | __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)), |
200 | interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1)); |
201 | |
202 | return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B), |
203 | interlaced_x_weights); |
204 | }; |
205 | |
206 | // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B. |
207 | // Returns two pixels, with each color channel in a 16-bit lane of the __m128i. |
208 | auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1, |
209 | uint32_t A2, uint32_t A3, |
210 | uint32_t B0, uint32_t B1, |
211 | uint32_t B2, uint32_t B3, |
212 | __m128i interlaced_x_weights, |
213 | int wy) { |
214 | // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights. |
215 | __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights), |
216 | bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights); |
217 | |
218 | // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy |
219 | // as 16*top + (bot-top)*wy to save a multiply. |
220 | __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4), |
221 | _mm_mullo_epi16(_mm_sub_epi16(bot, top), |
222 | _mm_set1_epi16(wy))); |
223 | |
224 | // Scale down by total max weight 16x16 = 256. |
225 | px = _mm_srli_epi16(px, 8); |
226 | |
227 | // Scale by alpha if needed. |
228 | if (s.fAlphaScale < 256) { |
229 | px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8); |
230 | } |
231 | return px; |
232 | }; |
233 | |
234 | // We're in _DX mode here, so we're only varying in X. |
235 | // That means the first entry of xy is our constant pair of Y coordinates and weight in Y. |
236 | // All the other entries in xy will be pairs of X coordinates and the X weight. |
237 | int y0, y1, wy; |
238 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
239 | |
240 | auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()), |
241 | row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes()); |
242 | |
243 | while (count >= 4) { |
244 | // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels. |
245 | int x0[4], |
246 | x1[4]; |
247 | __m128i wx; |
248 | |
249 | // decode_packed_coordinates_and_weight(), 4x. |
250 | __m128i packed = _mm_loadu_si128((const __m128i*)xy); |
251 | _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18)); |
252 | _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff))); |
253 | wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15] |
254 | |
255 | // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1, |
256 | // and sixteen minus that as wl for pixels on the left at x0. |
257 | __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)), |
258 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
259 | |
260 | // We need to interlace wl and wr for _mm_maddubs_epi16(). |
261 | __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr), |
262 | interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr); |
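        // e.g. interlaced_x_weights_AB = [16-wxA,wxA]x4 then [16-wxB,wxB]x4,
        // one (left,right) byte pair per color channel of output pixels A and B.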
263 | |
264 | enum { A,B,C,D }; |
265 | |
266 | // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time |
267 | // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each. |
268 | __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]], |
269 | row1[x0[A]], row1[x1[A]], |
270 | row0[x0[B]], row0[x1[B]], |
271 | row1[x0[B]], row1[x1[B]], |
272 | interlaced_x_weights_AB, wy); |
273 | |
274 | // Once more with the other half of the x-weights for two more pixels C,D. |
275 | __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]], |
276 | row1[x0[C]], row1[x1[C]], |
277 | row0[x0[D]], row0[x1[D]], |
278 | row1[x0[D]], row1[x1[D]], |
279 | interlaced_x_weights_CD, wy); |
280 | |
        // Pack the 16-bit lanes back down to 8-bit channels and write out four pixels!
282 | _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD)); |
283 | xy += 4; |
284 | colors += 4; |
285 | count -= 4; |
286 | } |
287 | |
288 | while (count --> 0) { |
289 | // This is exactly the same flow as the count >= 4 loop above, but writing one pixel. |
290 | int x0, x1, wx; |
291 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
292 | |
293 | // As above, splat out wx four times as wr, and sixteen minus that as wl. |
294 | __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine. |
295 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
296 | |
297 | __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr); |
298 | |
299 | __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1], |
300 | row1[x0], row1[x1], |
301 | 0, 0, |
302 | 0, 0, |
303 | interlaced_x_weights, wy); |
304 | |
305 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128())); |
306 | } |
307 | } |
308 | |
309 | |
310 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
311 | |
312 | /*not static*/ inline |
313 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
314 | const uint32_t* xy, int count, uint32_t* colors) { |
315 | SkASSERT(count > 0 && colors != nullptr); |
316 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
317 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
318 | SkASSERT(s.fAlphaScale <= 256); |
319 | |
320 | int y0, y1, wy; |
321 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
322 | |
323 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
324 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
325 | |
326 | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
327 | // and another in the upper 4 16-bit lanes to line up with 16 - wy. |
328 | const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here. |
329 | _mm_set1_epi16(16-wy)); // Top pixel goes here. |
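    // e.g. with wy = 5, allY = [ 5,5,5,5, 11,11,11,11 ] across its 16-bit lanes.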
330 | |
331 | while (count --> 0) { |
332 | int x0, x1, wx; |
333 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
334 | |
335 | // Load the 4 pixels we're interpolating, in this grid: |
336 | // | tl tr | |
337 | // | bl br | |
338 | const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]), |
339 | bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]); |
340 | |
341 | // We want to calculate a sum of 4 pixels weighted in two directions: |
342 | // |
343 | // sum = tl * (16-wy) * (16-wx) |
344 | // + bl * ( wy) * (16-wx) |
345 | // + tr * (16-wy) * ( wx) |
346 | // + br * ( wy) * ( wx) |
347 | // |
348 | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
349 | // |
350 | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
351 | // to apply those y-direction weights. So we'll start on the x-direction |
352 | // first, grouping into left and right halves, lined up with allY: |
353 | // |
354 | // L = [bl, tl] |
355 | // R = [br, tr] |
356 | // |
357 | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
358 | // |
359 | // Rewriting that one more step, we can replace a multiply with a shift: |
360 | // |
361 | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
362 | // |
363 | // That's how we'll actually do this math. |
364 | |
365 | __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()), |
366 | R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128()); |
367 | |
368 | __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4), |
369 | _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx))); |
370 | |
371 | __m128i sum_in_x = _mm_mullo_epi16(inner, allY); |
372 | |
        // sum = horizontalSum( ... ): add the top-pixel half (the high 64 bits) onto
        // the bottom-pixel half; the result lands in the low four 16-bit lanes.
        __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
375 | |
376 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
377 | sum = _mm_srli_epi16(sum, 8); |
378 | |
379 | if (s.fAlphaScale < 256) { |
380 | // Scale by alpha, which is in [0,256]. |
381 | sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale)); |
382 | sum = _mm_srli_epi16(sum, 8); |
383 | } |
384 | |
385 | // Pack back into 8-bit values and store. |
386 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128())); |
387 | } |
388 | } |
389 | |
390 | #else |
391 | |
392 | // The NEON code only actually differs from the portable code in the |
393 | // filtering step after we've loaded all four pixels we want to bilerp. |
394 | |
395 | #if defined(SK_ARM_HAS_NEON) |
396 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
397 | SkPMColor a00, SkPMColor a01, |
398 | SkPMColor a10, SkPMColor a11, |
399 | SkPMColor *dst, |
400 | uint16_t scale) { |
401 | uint8x8_t vy, vconst16_8, v16_y, vres; |
402 | uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
403 | uint32x2_t va0, va1; |
404 | uint16x8_t tmp1, tmp2; |
405 | |
406 | vy = vdup_n_u8(y); // duplicate y into vy |
407 | vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
408 | v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
409 | |
410 | va0 = vdup_n_u32(a00); // duplicate a00 |
411 | va1 = vdup_n_u32(a10); // duplicate a10 |
412 | va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
413 | va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
414 | |
415 | tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
416 | tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y |
417 | |
418 | vx = vdup_n_u16(x); // duplicate x into vx |
419 | vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 |
420 | v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
421 | |
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01*(16-y) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11*   y   * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00*(16-y) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10*   y   * (16-x)
426 | |
427 | if (scale < 256) { |
428 | vscale = vdup_n_u16(scale); // duplicate scale |
429 | tmp = vshr_n_u16(tmp, 8); // shift down result by 8 |
430 | tmp = vmul_u16(tmp, vscale); // multiply result by scale |
431 | } |
432 | |
    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down by 8 and narrow to 8-bit channels
434 | vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
435 | } |
436 | #else |
437 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
438 | SkPMColor a00, SkPMColor a01, |
439 | SkPMColor a10, SkPMColor a11, |
440 | SkPMColor* dstColor, |
441 | unsigned alphaScale) { |
442 | SkASSERT((unsigned)x <= 0xF); |
443 | SkASSERT((unsigned)y <= 0xF); |
444 | SkASSERT(alphaScale <= 256); |
445 | |
446 | int xy = x * y; |
447 | const uint32_t mask = 0xFF00FF; |
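    // mask selects two of the four 8-bit channels of an 8888 pixel, each parked in the
    // low byte of its own 16-bit half, leaving 8 bits of headroom so multiplying by a
    // weight in [0,256] can't carry between channels; (a00 >> 8) & mask lines up the
    // other two channels the same way.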
448 | |
    int scale = 256 - 16*y - 16*x + xy;   // (16-x)*(16-y), the bilinear weight for a00.
450 | uint32_t lo = (a00 & mask) * scale; |
451 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
452 | |
    scale = 16*x - xy;                    // x*(16-y), the weight for a01.
454 | lo += (a01 & mask) * scale; |
455 | hi += ((a01 >> 8) & mask) * scale; |
456 | |
    scale = 16*y - xy;                    // y*(16-x), the weight for a10.
458 | lo += (a10 & mask) * scale; |
459 | hi += ((a10 >> 8) & mask) * scale; |
460 | |
    lo += (a11 & mask) * xy;              // xy == x*y, the weight for a11.
462 | hi += ((a11 >> 8) & mask) * xy; |
463 | |
464 | if (alphaScale < 256) { |
465 | lo = ((lo >> 8) & mask) * alphaScale; |
466 | hi = ((hi >> 8) & mask) * alphaScale; |
467 | } |
468 | |
469 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
470 | } |
471 | #endif |
472 | |
473 | |
474 | /*not static*/ inline |
475 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
476 | const uint32_t* xy, int count, SkPMColor* colors) { |
477 | SkASSERT(count > 0 && colors != nullptr); |
478 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
479 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
480 | SkASSERT(s.fAlphaScale <= 256); |
481 | |
482 | int y0, y1, wy; |
483 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
484 | |
485 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
486 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
487 | |
488 | while (count --> 0) { |
489 | int x0, x1, wx; |
490 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
491 | |
492 | filter_and_scale_by_alpha(wx, wy, |
493 | row0[x0], row0[x1], |
494 | row1[x0], row1[x1], |
495 | colors++, |
496 | s.fAlphaScale); |
497 | } |
498 | } |
499 | |
500 | #endif |
501 | |
502 | #if defined(SK_ARM_HAS_NEON) |
503 | /*not static*/ inline |
504 | void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s, |
505 | const uint32_t* xy, int count, SkPMColor* colors) { |
506 | SkASSERT(count > 0 && colors != nullptr); |
507 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
508 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
509 | SkASSERT(s.fAlphaScale <= 256); |
510 | |
511 | auto src = (const char*)s.fPixmap.addr(); |
512 | size_t rb = s.fPixmap.rowBytes(); |
513 | |
514 | while (count --> 0) { |
515 | int y0, y1, wy, |
516 | x0, x1, wx; |
517 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
518 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
519 | |
520 | auto row0 = (const uint32_t*)(src + y0*rb), |
521 | row1 = (const uint32_t*)(src + y1*rb); |
522 | |
523 | filter_and_scale_by_alpha(wx, wy, |
524 | row0[x0], row0[x1], |
525 | row1[x0], row1[x1], |
526 | colors++, |
527 | s.fAlphaScale); |
528 | } |
529 | } |
530 | #else |
531 | // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2. |
532 | constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&, |
533 | const uint32_t*, int, SkPMColor*) = nullptr; |
534 | #endif |
535 | |
536 | } // namespace SK_OPTS_NS |
537 | |
538 | #endif |
539 | |