1 | /* |
2 | * Copyright 2006 The Android Open Source Project |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #include "include/core/SkShader.h" |
9 | #include "include/private/SkColorData.h" |
10 | #include "include/private/SkVx.h" |
11 | #include "src/core/SkCoreBlitters.h" |
12 | #include "src/core/SkUtils.h" |
13 | #include "src/core/SkXfermodePriv.h" |
14 | |
15 | static inline int upscale_31_to_32(int value) { |
16 | SkASSERT((unsigned)value <= 31); |
17 | return value + (value >> 4); |
18 | } |
19 | |
20 | static inline int blend_32(int src, int dst, int scale) { |
21 | SkASSERT((unsigned)src <= 0xFF); |
22 | SkASSERT((unsigned)dst <= 0xFF); |
23 | SkASSERT((unsigned)scale <= 32); |
24 | return dst + ((src - dst) * scale >> 5); |
25 | } |
26 | |
27 | static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB, |
28 | SkPMColor dst, uint16_t mask) { |
29 | if (mask == 0) { |
30 | return dst; |
31 | } |
32 | |
33 | /* We want all of these in 5bits, hence the shifts in case one of them |
34 | * (green) is 6bits. |
35 | */ |
36 | int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); |
37 | int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); |
38 | int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5); |
39 | |
40 | // Now upscale them to 0..32, so we can use blend32 |
41 | maskR = upscale_31_to_32(maskR); |
42 | maskG = upscale_31_to_32(maskG); |
43 | maskB = upscale_31_to_32(maskB); |
44 | |
45 | // srcA has been upscaled to 256 before passed into this function |
46 | maskR = maskR * srcA >> 8; |
47 | maskG = maskG * srcA >> 8; |
48 | maskB = maskB * srcA >> 8; |
49 | |
50 | int dstR = SkGetPackedR32(dst); |
51 | int dstG = SkGetPackedG32(dst); |
52 | int dstB = SkGetPackedB32(dst); |
53 | |
54 | // LCD blitting is only supported if the dst is known/required |
55 | // to be opaque |
56 | return SkPackARGB32(0xFF, |
57 | blend_32(srcR, dstR, maskR), |
58 | blend_32(srcG, dstG, maskG), |
59 | blend_32(srcB, dstB, maskB)); |
60 | } |
61 | |
62 | static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB, |
63 | SkPMColor dst, uint16_t mask, |
64 | SkPMColor opaqueDst) { |
65 | if (mask == 0) { |
66 | return dst; |
67 | } |
68 | |
69 | if (0xFFFF == mask) { |
70 | return opaqueDst; |
71 | } |
72 | |
73 | /* We want all of these in 5bits, hence the shifts in case one of them |
74 | * (green) is 6bits. |
75 | */ |
76 | int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); |
77 | int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); |
78 | int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5); |
79 | |
80 | // Now upscale them to 0..32, so we can use blend32 |
81 | maskR = upscale_31_to_32(maskR); |
82 | maskG = upscale_31_to_32(maskG); |
83 | maskB = upscale_31_to_32(maskB); |
84 | |
85 | int dstR = SkGetPackedR32(dst); |
86 | int dstG = SkGetPackedG32(dst); |
87 | int dstB = SkGetPackedB32(dst); |
88 | |
89 | // LCD blitting is only supported if the dst is known/required |
90 | // to be opaque |
91 | return SkPackARGB32(0xFF, |
92 | blend_32(srcR, dstR, maskR), |
93 | blend_32(srcG, dstG, maskG), |
94 | blend_32(srcB, dstB, maskB)); |
95 | } |
96 | |
97 | |
98 | // TODO: rewrite at least the SSE code here. It's miserable. |
99 | |
100 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
101 | #include <emmintrin.h> |
102 | |
// Per-channel shift distances (positive means "shift left") that move the top
// 5 bits of a 16-bit mask channel so that they start at the bit position of the
// matching channel in an SkPMColor. The mask's RGB16 channel order may differ
// from the SkPMColor order, so a distance can be zero, positive or negative.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked?16x5ToUnmasked?32x5_SSE2(x) applies the distance above to every
// 32-bit lane of x. The result is "unmasked": bits of other channels are still
// present and the caller must AND with (0x1F << SK_?32_SHIFT).

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
133 | |
134 | static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) { |
135 | // In the following comments, the components of src, dst and mask are |
136 | // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
137 | // by an R, G, B, or A suffix. Components of one of the four pixels that |
138 | // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
139 | // example is the blue channel of the second destination pixel. Memory |
140 | // layout is shown for an ARGB byte order in a color value. |
141 | |
142 | // src and srcA store 8-bit values interleaved with zeros. |
143 | // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
144 | // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, |
145 | // srcA, 0, srcA, 0, srcA, 0, srcA, 0) |
146 | // mask stores 16-bit values (compressed three channels) interleaved with zeros. |
147 | // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. |
148 | // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
149 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
150 | |
151 | // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
152 | // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
153 | __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
154 | _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
155 | |
156 | // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
157 | __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
158 | _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
159 | |
160 | // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
161 | __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
162 | _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
163 | |
164 | // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
165 | // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
166 | // 8-bit position |
167 | // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
168 | // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
169 | mask = _mm_or_si128(_mm_or_si128(r, g), b); |
170 | |
171 | // Interleave R,G,B into the lower byte of word. |
172 | // i.e. split the sixteen 8-bit values from mask into two sets of eight |
173 | // 16-bit values, padded by zero. |
174 | __m128i maskLo, maskHi; |
175 | // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
176 | maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
177 | // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
178 | maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
179 | |
180 | // Upscale from 0..31 to 0..32 |
181 | // (allows to replace division by left-shift further down) |
182 | // Left-shift each component by 4 and add the result back to that component, |
183 | // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
184 | maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
185 | maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
186 | |
187 | // Multiply each component of maskLo and maskHi by srcA |
188 | maskLo = _mm_mullo_epi16(maskLo, srcA); |
189 | maskHi = _mm_mullo_epi16(maskHi, srcA); |
190 | |
191 | // Left shift mask components by 8 (divide by 256) |
192 | maskLo = _mm_srli_epi16(maskLo, 8); |
193 | maskHi = _mm_srli_epi16(maskHi, 8); |
194 | |
195 | // Interleave R,G,B into the lower byte of the word |
196 | // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
197 | __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
198 | // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
199 | __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
200 | |
201 | // mask = (src - dst) * mask |
202 | maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
203 | maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
204 | |
205 | // mask = (src - dst) * mask >> 5 |
206 | maskLo = _mm_srai_epi16(maskLo, 5); |
207 | maskHi = _mm_srai_epi16(maskHi, 5); |
208 | |
209 | // Add two pixels into result. |
210 | // result = dst + ((src - dst) * mask >> 5) |
211 | __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
212 | __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
213 | |
214 | // Pack into 4 32bit dst pixels. |
215 | // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
216 | // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
217 | // clamping to 255 if necessary. |
218 | return _mm_packus_epi16(resultLo, resultHi); |
219 | } |
220 | |
221 | static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) { |
222 | // In the following comments, the components of src, dst and mask are |
223 | // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
224 | // by an R, G, B, or A suffix. Components of one of the four pixels that |
225 | // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
226 | // example is the blue channel of the second destination pixel. Memory |
227 | // layout is shown for an ARGB byte order in a color value. |
228 | |
229 | // src and srcA store 8-bit values interleaved with zeros. |
230 | // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
231 | // mask stores 16-bit values (shown as high and low bytes) interleaved with |
232 | // zeros |
233 | // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
234 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
235 | |
236 | // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
237 | // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
238 | __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
239 | _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
240 | |
241 | // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
242 | __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
243 | _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
244 | |
245 | // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
246 | __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
247 | _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
248 | |
249 | // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
250 | // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
251 | // 8-bit position |
252 | // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
253 | // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
254 | mask = _mm_or_si128(_mm_or_si128(r, g), b); |
255 | |
256 | // Interleave R,G,B into the lower byte of word. |
257 | // i.e. split the sixteen 8-bit values from mask into two sets of eight |
258 | // 16-bit values, padded by zero. |
259 | __m128i maskLo, maskHi; |
260 | // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
261 | maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
262 | // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
263 | maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
264 | |
265 | // Upscale from 0..31 to 0..32 |
266 | // (allows to replace division by left-shift further down) |
267 | // Left-shift each component by 4 and add the result back to that component, |
268 | // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
269 | maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
270 | maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
271 | |
272 | // Interleave R,G,B into the lower byte of the word |
273 | // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
274 | __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
275 | // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
276 | __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
277 | |
278 | // mask = (src - dst) * mask |
279 | maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
280 | maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
281 | |
282 | // mask = (src - dst) * mask >> 5 |
283 | maskLo = _mm_srai_epi16(maskLo, 5); |
284 | maskHi = _mm_srai_epi16(maskHi, 5); |
285 | |
286 | // Add two pixels into result. |
287 | // result = dst + ((src - dst) * mask >> 5) |
288 | __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
289 | __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
290 | |
291 | // Pack into 4 32bit dst pixels and force opaque. |
292 | // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
293 | // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
294 | // clamping to 255 if necessary. Set alpha components to 0xFF. |
295 | return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
296 | _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
297 | } |
298 | |
299 | void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) { |
300 | if (width <= 0) { |
301 | return; |
302 | } |
303 | |
304 | int srcA = SkColorGetA(src); |
305 | int srcR = SkColorGetR(src); |
306 | int srcG = SkColorGetG(src); |
307 | int srcB = SkColorGetB(src); |
308 | |
309 | srcA = SkAlpha255To256(srcA); |
310 | |
311 | if (width >= 4) { |
312 | SkASSERT(((size_t)dst & 0x03) == 0); |
313 | while (((size_t)dst & 0x0F) != 0) { |
314 | *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask); |
315 | mask++; |
316 | dst++; |
317 | width--; |
318 | } |
319 | |
320 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
321 | // Set alpha to 0xFF and replicate source four times in SSE register. |
322 | __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
323 | // Interleave with zeros to get two sets of four 16-bit values. |
324 | src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
325 | // Set srcA_sse to contain eight copies of srcA, padded with zero. |
326 | // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
327 | __m128i srcA_sse = _mm_set1_epi16(srcA); |
328 | while (width >= 4) { |
329 | // Load four destination pixels into dst_sse. |
330 | __m128i dst_sse = _mm_load_si128(d); |
331 | // Load four 16-bit masks into lower half of mask_sse. |
332 | __m128i mask_sse = _mm_loadl_epi64( |
333 | reinterpret_cast<const __m128i*>(mask)); |
334 | |
335 | // Check whether masks are equal to 0 and get the highest bit |
336 | // of each byte of result, if masks are all zero, we will get |
337 | // pack_cmp to 0xFFFF |
338 | int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
339 | _mm_setzero_si128())); |
340 | |
341 | // if mask pixels are not all zero, we will blend the dst pixels |
342 | if (pack_cmp != 0xFFFF) { |
343 | // Unpack 4 16bit mask pixels to |
344 | // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
345 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
346 | mask_sse = _mm_unpacklo_epi16(mask_sse, |
347 | _mm_setzero_si128()); |
348 | |
349 | // Process 4 32bit dst pixels |
350 | __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse); |
351 | _mm_store_si128(d, result); |
352 | } |
353 | |
354 | d++; |
355 | mask += 4; |
356 | width -= 4; |
357 | } |
358 | |
359 | dst = reinterpret_cast<SkPMColor*>(d); |
360 | } |
361 | |
362 | while (width > 0) { |
363 | *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask); |
364 | mask++; |
365 | dst++; |
366 | width--; |
367 | } |
368 | } |
369 | |
370 | void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[], |
371 | SkColor src, int width, SkPMColor opaqueDst) { |
372 | if (width <= 0) { |
373 | return; |
374 | } |
375 | |
376 | int srcR = SkColorGetR(src); |
377 | int srcG = SkColorGetG(src); |
378 | int srcB = SkColorGetB(src); |
379 | |
380 | if (width >= 4) { |
381 | SkASSERT(((size_t)dst & 0x03) == 0); |
382 | while (((size_t)dst & 0x0F) != 0) { |
383 | *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
384 | mask++; |
385 | dst++; |
386 | width--; |
387 | } |
388 | |
389 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
390 | // Set alpha to 0xFF and replicate source four times in SSE register. |
391 | __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
392 | // Set srcA_sse to contain eight copies of srcA, padded with zero. |
393 | // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
394 | src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
395 | while (width >= 4) { |
396 | // Load four destination pixels into dst_sse. |
397 | __m128i dst_sse = _mm_load_si128(d); |
398 | // Load four 16-bit masks into lower half of mask_sse. |
399 | __m128i mask_sse = _mm_loadl_epi64( |
400 | reinterpret_cast<const __m128i*>(mask)); |
401 | |
402 | // Check whether masks are equal to 0 and get the highest bit |
403 | // of each byte of result, if masks are all zero, we will get |
404 | // pack_cmp to 0xFFFF |
405 | int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
406 | _mm_setzero_si128())); |
407 | |
408 | // if mask pixels are not all zero, we will blend the dst pixels |
409 | if (pack_cmp != 0xFFFF) { |
410 | // Unpack 4 16bit mask pixels to |
411 | // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
412 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
413 | mask_sse = _mm_unpacklo_epi16(mask_sse, |
414 | _mm_setzero_si128()); |
415 | |
416 | // Process 4 32bit dst pixels |
417 | __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse); |
418 | _mm_store_si128(d, result); |
419 | } |
420 | |
421 | d++; |
422 | mask += 4; |
423 | width -= 4; |
424 | } |
425 | |
426 | dst = reinterpret_cast<SkPMColor*>(d); |
427 | } |
428 | |
429 | while (width > 0) { |
430 | *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
431 | mask++; |
432 | dst++; |
433 | width--; |
434 | } |
435 | } |
436 | |
437 | #elif defined(SK_ARM_HAS_NEON) |
438 | #include <arm_neon.h> |
439 | |
// Index of each colour channel inside a uint8x8x4_t loaded with vld4_u8 from
// SkPMColor data: the channel's bit shift within the pixel divided by 8.
#define NEON_A (SK_A32_SHIFT / 8)
#define NEON_R (SK_R32_SHIFT / 8)
#define NEON_G (SK_G32_SHIFT / 8)
#define NEON_B (SK_B32_SHIFT / 8)

// NEON counterpart of blend_32() for eight channel values at once:
// returns dst + (((src - dst) * scale) >> 5) per lane, narrowed to 8 bits.
//   src, dst - eight 8-bit channel values each
//   scale    - eight 16-bit weights (callers pass values in 0..32)
static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
    // Widen to signed 16 bits so the difference may go negative.
    const int16x8_t srcWide = vreinterpretq_s16_u16(vmovl_u8(src));
    const int16x8_t dstWide = vreinterpretq_s16_u16(vmovl_u8(dst));

    const int16x8_t weighted = vmulq_s16(vsubq_s16(srcWide, dstWide),
                                         vreinterpretq_s16_u16(scale));
    const int16x8_t blended = vaddq_s16(dstWide, vshrq_n_s16(weighted, 5));

    return vmovn_u16(vreinterpretq_u16_s16(blended));
}
457 | |
// Blends one row of `width` pixels of an opaque colour into dst through an
// LCD16 coverage mask (NEON build).
//
//   dst       - destination pixels
//   src       - one 16-bit coverage value per destination pixel
//   color     - source colour; only its R, G and B are read here
//   width     - number of pixels; <= 0 is a no-op
//   opaqueDst - pixel stored wherever the coverage is 0xFFFF
//
// Eight pixels are handled per vector iteration; the remainder goes through
// the scalar blend_lcd16_opaque().
// NOTE(review): the channel extraction below appears to assume red is the
// topmost field of the 16-bit mask and blue the lowest -- confirm against the
// SK_*16_* definitions.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                           SkColor color, int width,
                           SkPMColor opaqueDst) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    const uint8x8_t vcolR = vdup_n_u8(colR);
    const uint8x8_t vcolG = vdup_n_u8(colG);
    const uint8x8_t vcolB = vdup_n_u8(colB);
    const uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
    const uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
    const uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
    const uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));

    for (; width >= 8; width -= 8, dst += 8, src += 8) {
        // De-interleave eight pixels into four per-channel vectors.
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        const uint16x8_t vmask = vld1q_u16(src);

        // Per-pixel byte selectors: 0xFF where the coverage is exactly 0
        // (transparent) or exactly 0xFFFF (opaque), 0 elsewhere.
        const uint8x8_t vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        const uint8x8_t vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Extract each channel's coverage as a 5-bit value.
        uint16x8_t vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        uint16x8_t vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                        SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t vmaskB = vandq_u16(vmask, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        vmaskR = vaddq_u16(vmaskR, vshrq_n_u16(vmaskR, 4));
        vmaskG = vaddq_u16(vmaskG, vshrq_n_u16(vmaskG, 4));
        vmaskB = vaddq_u16(vmaskB, vshrq_n_u16(vmaskB, 4));

        // Alpha: unchanged where coverage is 0, opaqueDst's alpha where it is
        // 0xFFFF, and 0xFF everywhere else.
        const uint8x8_t vpartialA = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vpartialA);

        // Colour: blend by coverage, then substitute opaqueDst where the
        // coverage is 0xFFFF. Zero coverage blends to the unchanged dst.
        const uint8x8_t vblendR = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        const uint8x8_t vblendG = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        const uint8x8_t vblendB = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);

        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vblendR);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vblendG);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vblendB);

        vst4_u8((uint8_t*)dst, vdst);
    }

    // Leftovers (fewer than eight pixels).
    for (; width > 0; --width, ++dst, ++src) {
        *dst = blend_lcd16_opaque(colR, colG, colB, *dst, *src, opaqueDst);
    }
}
520 | |
// Blends one row of `width` pixels of a possibly translucent colour into dst
// through an LCD16 coverage mask (NEON build).
//
//   dst   - destination pixels
//   src   - one 16-bit coverage value per destination pixel
//   color - unpremultiplied source colour; its alpha attenuates the coverage
//   width - number of pixels; <= 0 is a no-op
// The trailing unnamed SkPMColor exists only so this function has the same
// signature as blit_row_lcd16_opaque; it is ignored.
//
// Eight pixels are handled per vector iteration, and every pixel written by
// the vector loop gets alpha 0xFF; the remainder goes through the scalar
// blend_lcd16().
// NOTE(review): the channel extraction below appears to assume red is the
// topmost field of the 16-bit mask and blue the lowest -- confirm against the
// SK_*16_* definitions.
void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                    SkColor color, int width, SkPMColor) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);
    // Alpha in 0..256 so that ">> 8" is an exact scale.
    const int colA = SkAlpha255To256(SkColorGetA(color));

    const uint16x8_t vcolA = vdupq_n_u16(colA);
    const uint8x8_t vcolR = vdup_n_u8(colR);
    const uint8x8_t vcolG = vdup_n_u8(colG);
    const uint8x8_t vcolB = vdup_n_u8(colB);

    for (; width >= 8; width -= 8, dst += 8, src += 8) {
        // De-interleave eight pixels into four per-channel vectors.
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        const uint16x8_t vmask = vld1q_u16(src);

        // Extract each channel's coverage as a 5-bit value.
        uint16x8_t vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        uint16x8_t vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                        SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t vmaskB = vandq_u16(vmask, vdupq_n_u16(SK_B16_MASK));

        // Upscale 0..31 to 0..32.
        vmaskR = vaddq_u16(vmaskR, vshrq_n_u16(vmaskR, 4));
        vmaskG = vaddq_u16(vmaskG, vshrq_n_u16(vmaskG, 4));
        vmaskB = vaddq_u16(vmaskB, vshrq_n_u16(vmaskB, 4));

        // Attenuate by the source alpha: (coverage * alpha) >> 8.
        vmaskR = vshrq_n_u16(vmulq_u16(vmaskR, vcolA), 8);
        vmaskG = vshrq_n_u16(vmulq_u16(vmaskG, vcolA), 8);
        vmaskB = vshrq_n_u16(vmulq_u16(vmaskB, vcolA), 8);

        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);
    }

    // Leftovers (fewer than eight pixels).
    for (; width > 0; --width, ++dst, ++src) {
        *dst = blend_lcd16(colA, colR, colG, colB, *dst, *src);
    }
}
574 | |
575 | #else |
576 | |
577 | static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], |
578 | SkColor src, int width, SkPMColor) { |
579 | int srcA = SkColorGetA(src); |
580 | int srcR = SkColorGetR(src); |
581 | int srcG = SkColorGetG(src); |
582 | int srcB = SkColorGetB(src); |
583 | |
584 | srcA = SkAlpha255To256(srcA); |
585 | |
586 | for (int i = 0; i < width; i++) { |
587 | dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]); |
588 | } |
589 | } |
590 | |
591 | static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[], |
592 | SkColor src, int width, |
593 | SkPMColor opaqueDst) { |
594 | int srcR = SkColorGetR(src); |
595 | int srcG = SkColorGetG(src); |
596 | int srcB = SkColorGetB(src); |
597 | |
598 | for (int i = 0; i < width; i++) { |
599 | dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst); |
600 | } |
601 | } |
602 | |
603 | #endif |
604 | |
605 | static bool blit_color(const SkPixmap& device, |
606 | const SkMask& mask, |
607 | const SkIRect& clip, |
608 | SkColor color) { |
609 | int x = clip.fLeft, |
610 | y = clip.fTop; |
611 | |
612 | if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) { |
613 | SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(), |
614 | (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes, |
615 | color, clip.width(), clip.height()); |
616 | return true; |
617 | } |
618 | |
619 | if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) { |
620 | auto dstRow = device.writable_addr32(x,y); |
621 | auto maskRow = (const uint16_t*)mask.getAddr(x,y); |
622 | |
623 | auto blit_row = blit_row_lcd16; |
624 | SkPMColor opaqueDst = 0; // ignored unless opaque |
625 | |
626 | if (0xff == SkColorGetA(color)) { |
627 | blit_row = blit_row_lcd16_opaque; |
628 | opaqueDst = SkPreMultiplyColor(color); |
629 | } |
630 | |
631 | for (int height = clip.height(); height --> 0; ) { |
632 | blit_row(dstRow, maskRow, color, clip.width(), opaqueDst); |
633 | |
634 | dstRow = (SkPMColor*) (( char*) dstRow + device.rowBytes()); |
635 | maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes); |
636 | } |
637 | return true; |
638 | } |
639 | |
640 | return false; |
641 | } |
642 | |
643 | /////////////////////////////////////////////////////////////////////////////// |
644 | |
645 | static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask, |
646 | const SkIRect& clip, SkPMColor srcColor) { |
647 | U8CPU alpha = SkGetPackedA32(srcColor); |
648 | unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32; |
649 | if (alpha != 255) { |
650 | flags |= SkBlitRow::kGlobalAlpha_Flag32; |
651 | } |
652 | SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags); |
653 | |
654 | int x = clip.fLeft; |
655 | int y = clip.fTop; |
656 | int width = clip.width(); |
657 | int height = clip.height(); |
658 | |
659 | SkPMColor* dstRow = device.writable_addr32(x, y); |
660 | const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y)); |
661 | |
662 | do { |
663 | proc(dstRow, srcRow, width, alpha); |
664 | dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes()); |
665 | srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes); |
666 | } while (--height != 0); |
667 | } |
668 | |
669 | ////////////////////////////////////////////////////////////////////////////////////// |
670 | |
671 | SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint) |
672 | : INHERITED(device) { |
673 | SkColor color = paint.getColor(); |
674 | fColor = color; |
675 | |
676 | fSrcA = SkColorGetA(color); |
677 | unsigned scale = SkAlpha255To256(fSrcA); |
678 | fSrcR = SkAlphaMul(SkColorGetR(color), scale); |
679 | fSrcG = SkAlphaMul(SkColorGetG(color), scale); |
680 | fSrcB = SkAlphaMul(SkColorGetB(color), scale); |
681 | |
682 | fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB); |
683 | } |
684 | |
685 | const SkPixmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) { |
686 | if (255 == fSrcA) { |
687 | *value = fPMColor; |
688 | return &fDevice; |
689 | } |
690 | return nullptr; |
691 | } |
692 | |
693 | #if defined _WIN32 // disable warning : local variable used without having been initialized |
694 | #pragma warning ( push ) |
695 | #pragma warning ( disable : 4701 ) |
696 | #endif |
697 | |
698 | void SkARGB32_Blitter::blitH(int x, int y, int width) { |
699 | SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width()); |
700 | |
701 | uint32_t* device = fDevice.writable_addr32(x, y); |
702 | SkBlitRow::Color32(device, device, width, fPMColor); |
703 | } |
704 | |
705 | void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[], |
706 | const int16_t runs[]) { |
707 | if (fSrcA == 0) { |
708 | return; |
709 | } |
710 | |
711 | uint32_t color = fPMColor; |
712 | uint32_t* device = fDevice.writable_addr32(x, y); |
713 | unsigned opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case |
714 | |
715 | for (;;) { |
716 | int count = runs[0]; |
717 | SkASSERT(count >= 0); |
718 | if (count <= 0) { |
719 | return; |
720 | } |
721 | unsigned aa = antialias[0]; |
722 | if (aa) { |
723 | if ((opaqueMask & aa) == 255) { |
724 | sk_memset32(device, color, count); |
725 | } else { |
726 | uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa)); |
727 | SkBlitRow::Color32(device, device, count, sc); |
728 | } |
729 | } |
730 | runs += count; |
731 | antialias += count; |
732 | device += count; |
733 | } |
734 | } |
735 | |
736 | void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) { |
737 | uint32_t* device = fDevice.writable_addr32(x, y); |
738 | SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);) |
739 | |
740 | device[0] = SkBlendARGB32(fPMColor, device[0], a0); |
741 | device[1] = SkBlendARGB32(fPMColor, device[1], a1); |
742 | } |
743 | |
744 | void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) { |
745 | uint32_t* device = fDevice.writable_addr32(x, y); |
746 | SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);) |
747 | |
748 | device[0] = SkBlendARGB32(fPMColor, device[0], a0); |
749 | device = (uint32_t*)((char*)device + fDevice.rowBytes()); |
750 | device[0] = SkBlendARGB32(fPMColor, device[0], a1); |
751 | } |
752 | |
753 | ////////////////////////////////////////////////////////////////////////////////////// |
754 | |
// Stores `color` into each of dst[0..7] whose bit is set in `mask`; the most
// significant bit (0x80) corresponds to dst[0] and the least significant
// (0x01) to dst[7]. Pixels whose bit is clear are left untouched.
//
// Every argument is parenthesized so that expression arguments (for example
// `a | b` as the mask) are combined as written instead of being re-associated
// by operator precedence. Arguments are still evaluated more than once, so
// they must not have side effects.
#define solid_8_pixels(mask, dst, color)                \
    do {                                                \
        if ((mask) & 0x80) { (dst)[0] = (color); }      \
        if ((mask) & 0x40) { (dst)[1] = (color); }      \
        if ((mask) & 0x20) { (dst)[2] = (color); }      \
        if ((mask) & 0x10) { (dst)[3] = (color); }      \
        if ((mask) & 0x08) { (dst)[4] = (color); }      \
        if ((mask) & 0x04) { (dst)[5] = (color); }      \
        if ((mask) & 0x02) { (dst)[6] = (color); }      \
        if ((mask) & 0x01) { (dst)[7] = (color); }      \
    } while (0)
766 | |
767 | #define SK_BLITBWMASK_NAME SkARGB32_BlitBW |
768 | #define SK_BLITBWMASK_ARGS , SkPMColor color |
769 | #define SK_BLITBWMASK_BLIT8(mask, dst) solid_8_pixels(mask, dst, color) |
770 | #define SK_BLITBWMASK_GETADDR writable_addr32 |
771 | #define SK_BLITBWMASK_DEVTYPE uint32_t |
772 | #include "src/core/SkBlitBWMaskTemplate.h" |
773 | |
// For each of dst[0..7] whose bit is set in `mask`, replaces the pixel with
// sc + SkAlphaMulQ(pixel, dst_scale). The most significant bit (0x80)
// corresponds to dst[0] and the least significant (0x01) to dst[7]. Pixels
// whose bit is clear are left untouched.
//
// Every argument is parenthesized so that expression arguments (for example
// `a | b` as the mask or as sc) are combined as written instead of being
// re-associated by operator precedence. Arguments are still evaluated more
// than once, so they must not have side effects.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                        \
    do {                                                                                \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }    \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }    \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }    \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }    \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }    \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }    \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }    \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }    \
    } while (0)
785 | |
// Instantiates the 1-bit (BW) mask blit template a second time, as
// SkARGB32_BlendBW. Instead of storing a solid color, each selected pixel is
// combined with a source color `sc` and a destination scale `dst_scale`
// through blend_8_pixels(); both values arrive as extra trailing parameters.
#define SK_BLITBWMASK_NAME SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst) blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR writable_addr32
#define SK_BLITBWMASK_DEVTYPE uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"
792 | |
793 | void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) { |
794 | SkASSERT(mask.fBounds.contains(clip)); |
795 | SkASSERT(fSrcA != 0xFF); |
796 | |
797 | if (fSrcA == 0) { |
798 | return; |
799 | } |
800 | |
801 | if (blit_color(fDevice, mask, clip, fColor)) { |
802 | return; |
803 | } |
804 | |
805 | switch (mask.fFormat) { |
806 | case SkMask::kBW_Format: |
807 | SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA)); |
808 | break; |
809 | case SkMask::kARGB32_Format: |
810 | SkARGB32_Blit32(fDevice, mask, clip, fPMColor); |
811 | break; |
812 | default: |
813 | SK_ABORT("Mask format not handled." ); |
814 | } |
815 | } |
816 | |
817 | void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask, |
818 | const SkIRect& clip) { |
819 | SkASSERT(mask.fBounds.contains(clip)); |
820 | |
821 | if (blit_color(fDevice, mask, clip, fColor)) { |
822 | return; |
823 | } |
824 | |
825 | switch (mask.fFormat) { |
826 | case SkMask::kBW_Format: |
827 | SkARGB32_BlitBW(fDevice, mask, clip, fPMColor); |
828 | break; |
829 | case SkMask::kARGB32_Format: |
830 | SkARGB32_Blit32(fDevice, mask, clip, fPMColor); |
831 | break; |
832 | default: |
833 | SK_ABORT("Mask format not handled." ); |
834 | } |
835 | } |
836 | |
837 | void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) { |
838 | uint32_t* device = fDevice.writable_addr32(x, y); |
839 | SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);) |
840 | |
841 | device[0] = SkFastFourByteInterp(fPMColor, device[0], a0); |
842 | device[1] = SkFastFourByteInterp(fPMColor, device[1], a1); |
843 | } |
844 | |
845 | void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) { |
846 | uint32_t* device = fDevice.writable_addr32(x, y); |
847 | SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);) |
848 | |
849 | device[0] = SkFastFourByteInterp(fPMColor, device[0], a0); |
850 | device = (uint32_t*)((char*)device + fDevice.rowBytes()); |
851 | device[0] = SkFastFourByteInterp(fPMColor, device[0], a1); |
852 | } |
853 | |
854 | /////////////////////////////////////////////////////////////////////////////// |
855 | |
856 | void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) { |
857 | if (alpha == 0 || fSrcA == 0) { |
858 | return; |
859 | } |
860 | |
861 | uint32_t* device = fDevice.writable_addr32(x, y); |
862 | uint32_t color = fPMColor; |
863 | |
864 | if (alpha != 255) { |
865 | color = SkAlphaMulQ(color, SkAlpha255To256(alpha)); |
866 | } |
867 | |
868 | unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color)); |
869 | size_t rowBytes = fDevice.rowBytes(); |
870 | while (--height >= 0) { |
871 | device[0] = color + SkAlphaMulQ(device[0], dst_scale); |
872 | device = (uint32_t*)((char*)device + rowBytes); |
873 | } |
874 | } |
875 | |
876 | void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) { |
877 | SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height()); |
878 | |
879 | if (fSrcA == 0) { |
880 | return; |
881 | } |
882 | |
883 | uint32_t* device = fDevice.writable_addr32(x, y); |
884 | uint32_t color = fPMColor; |
885 | size_t rowBytes = fDevice.rowBytes(); |
886 | |
887 | if (SkGetPackedA32(fPMColor) == 0xFF) { |
888 | SkOpts::rect_memset32(device, color, width, rowBytes, height); |
889 | } else { |
890 | while (height --> 0) { |
891 | SkBlitRow::Color32(device, device, width, color); |
892 | device = (uint32_t*)((char*)device + rowBytes); |
893 | } |
894 | } |
895 | } |
896 | |
// NOTE(review): presumably balances a `#pragma warning(push)` issued earlier in
// this file for Windows builds -- confirm that the matching push exists above.
#if defined _WIN32
#pragma warning ( pop )
#endif
900 | |
901 | /////////////////////////////////////////////////////////////////////// |
902 | |
903 | void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[], |
904 | const int16_t runs[]) { |
905 | uint32_t* device = fDevice.writable_addr32(x, y); |
906 | SkPMColor black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT); |
907 | |
908 | for (;;) { |
909 | int count = runs[0]; |
910 | SkASSERT(count >= 0); |
911 | if (count <= 0) { |
912 | return; |
913 | } |
914 | unsigned aa = antialias[0]; |
915 | if (aa) { |
916 | if (aa == 255) { |
917 | sk_memset32(device, black, count); |
918 | } else { |
919 | SkPMColor src = aa << SK_A32_SHIFT; |
920 | unsigned dst_scale = 256 - aa; |
921 | int n = count; |
922 | do { |
923 | --n; |
924 | device[n] = src + SkAlphaMulQ(device[n], dst_scale); |
925 | } while (n > 0); |
926 | } |
927 | } |
928 | runs += count; |
929 | antialias += count; |
930 | device += count; |
931 | } |
932 | } |
933 | |
934 | void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) { |
935 | uint32_t* device = fDevice.writable_addr32(x, y); |
936 | SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);) |
937 | |
938 | device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0); |
939 | device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1); |
940 | } |
941 | |
942 | void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) { |
943 | uint32_t* device = fDevice.writable_addr32(x, y); |
944 | SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);) |
945 | |
946 | device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0); |
947 | device = (uint32_t*)((char*)device + fDevice.rowBytes()); |
948 | device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1); |
949 | } |
950 | |
951 | /////////////////////////////////////////////////////////////////////////////// |
952 | |
953 | // Special version of SkBlitRow::Factory32 that knows we're in kSrc_Mode, |
954 | // instead of kSrcOver_Mode |
955 | static void blend_srcmode(SkPMColor* SK_RESTRICT device, |
956 | const SkPMColor* SK_RESTRICT span, |
957 | int count, U8CPU aa) { |
958 | int aa256 = SkAlpha255To256(aa); |
959 | for (int i = 0; i < count; ++i) { |
960 | device[i] = SkFourByteInterp256(span[i], device[i], aa256); |
961 | } |
962 | } |
963 | |
964 | SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device, |
965 | const SkPaint& paint, SkShaderBase::Context* shaderContext) |
966 | : INHERITED(device, paint, shaderContext) |
967 | { |
968 | fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor))); |
969 | |
970 | fXfermode = SkXfermode::Peek(paint.getBlendMode()); |
971 | |
972 | int flags = 0; |
973 | if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) { |
974 | flags |= SkBlitRow::kSrcPixelAlpha_Flag32; |
975 | } |
976 | // we call this on the output from the shader |
977 | fProc32 = SkBlitRow::Factory32(flags); |
978 | // we call this on the output from the shader + alpha from the aa buffer |
979 | fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32); |
980 | |
981 | fShadeDirectlyIntoDevice = false; |
982 | if (fXfermode == nullptr) { |
983 | if (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag) { |
984 | fShadeDirectlyIntoDevice = true; |
985 | } |
986 | } else { |
987 | if (SkBlendMode::kSrc == paint.getBlendMode()) { |
988 | fShadeDirectlyIntoDevice = true; |
989 | fProc32Blend = blend_srcmode; |
990 | } |
991 | } |
992 | |
993 | fConstInY = SkToBool(shaderContext->getFlags() & SkShaderBase::kConstInY32_Flag); |
994 | } |
995 | |
// Releases the scratch span buffer that the constructor allocated with
// sk_malloc_throw().
SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
    sk_free(fBuffer);
}
999 | |
1000 | void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) { |
1001 | SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width()); |
1002 | |
1003 | uint32_t* device = fDevice.writable_addr32(x, y); |
1004 | |
1005 | if (fShadeDirectlyIntoDevice) { |
1006 | fShaderContext->shadeSpan(x, y, device, width); |
1007 | } else { |
1008 | SkPMColor* span = fBuffer; |
1009 | fShaderContext->shadeSpan(x, y, span, width); |
1010 | if (fXfermode) { |
1011 | fXfermode->xfer32(device, span, width, nullptr); |
1012 | } else { |
1013 | fProc32(device, span, width, 255); |
1014 | } |
1015 | } |
1016 | } |
1017 | |
1018 | void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) { |
1019 | SkASSERT(x >= 0 && y >= 0 && |
1020 | x + width <= fDevice.width() && y + height <= fDevice.height()); |
1021 | |
1022 | uint32_t* device = fDevice.writable_addr32(x, y); |
1023 | size_t deviceRB = fDevice.rowBytes(); |
1024 | auto* shaderContext = fShaderContext; |
1025 | SkPMColor* span = fBuffer; |
1026 | |
1027 | if (fConstInY) { |
1028 | if (fShadeDirectlyIntoDevice) { |
1029 | // shade the first row directly into the device |
1030 | shaderContext->shadeSpan(x, y, device, width); |
1031 | span = device; |
1032 | while (--height > 0) { |
1033 | device = (uint32_t*)((char*)device + deviceRB); |
1034 | memcpy(device, span, width << 2); |
1035 | } |
1036 | } else { |
1037 | shaderContext->shadeSpan(x, y, span, width); |
1038 | SkXfermode* xfer = fXfermode; |
1039 | if (xfer) { |
1040 | do { |
1041 | xfer->xfer32(device, span, width, nullptr); |
1042 | y += 1; |
1043 | device = (uint32_t*)((char*)device + deviceRB); |
1044 | } while (--height > 0); |
1045 | } else { |
1046 | SkBlitRow::Proc32 proc = fProc32; |
1047 | do { |
1048 | proc(device, span, width, 255); |
1049 | y += 1; |
1050 | device = (uint32_t*)((char*)device + deviceRB); |
1051 | } while (--height > 0); |
1052 | } |
1053 | } |
1054 | return; |
1055 | } |
1056 | |
1057 | if (fShadeDirectlyIntoDevice) { |
1058 | do { |
1059 | shaderContext->shadeSpan(x, y, device, width); |
1060 | y += 1; |
1061 | device = (uint32_t*)((char*)device + deviceRB); |
1062 | } while (--height > 0); |
1063 | } else { |
1064 | SkXfermode* xfer = fXfermode; |
1065 | if (xfer) { |
1066 | do { |
1067 | shaderContext->shadeSpan(x, y, span, width); |
1068 | xfer->xfer32(device, span, width, nullptr); |
1069 | y += 1; |
1070 | device = (uint32_t*)((char*)device + deviceRB); |
1071 | } while (--height > 0); |
1072 | } else { |
1073 | SkBlitRow::Proc32 proc = fProc32; |
1074 | do { |
1075 | shaderContext->shadeSpan(x, y, span, width); |
1076 | proc(device, span, width, 255); |
1077 | y += 1; |
1078 | device = (uint32_t*)((char*)device + deviceRB); |
1079 | } while (--height > 0); |
1080 | } |
1081 | } |
1082 | } |
1083 | |
1084 | void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[], |
1085 | const int16_t runs[]) { |
1086 | SkPMColor* span = fBuffer; |
1087 | uint32_t* device = fDevice.writable_addr32(x, y); |
1088 | auto* shaderContext = fShaderContext; |
1089 | |
1090 | if (fXfermode && !fShadeDirectlyIntoDevice) { |
1091 | for (;;) { |
1092 | SkXfermode* xfer = fXfermode; |
1093 | |
1094 | int count = *runs; |
1095 | if (count <= 0) |
1096 | break; |
1097 | int aa = *antialias; |
1098 | if (aa) { |
1099 | shaderContext->shadeSpan(x, y, span, count); |
1100 | if (aa == 255) { |
1101 | xfer->xfer32(device, span, count, nullptr); |
1102 | } else { |
1103 | // count is almost always 1 |
1104 | for (int i = count - 1; i >= 0; --i) { |
1105 | xfer->xfer32(&device[i], &span[i], 1, antialias); |
1106 | } |
1107 | } |
1108 | } |
1109 | device += count; |
1110 | runs += count; |
1111 | antialias += count; |
1112 | x += count; |
1113 | } |
1114 | } else if (fShadeDirectlyIntoDevice || |
1115 | (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) { |
1116 | for (;;) { |
1117 | int count = *runs; |
1118 | if (count <= 0) { |
1119 | break; |
1120 | } |
1121 | int aa = *antialias; |
1122 | if (aa) { |
1123 | if (aa == 255) { |
1124 | // cool, have the shader draw right into the device |
1125 | shaderContext->shadeSpan(x, y, device, count); |
1126 | } else { |
1127 | shaderContext->shadeSpan(x, y, span, count); |
1128 | fProc32Blend(device, span, count, aa); |
1129 | } |
1130 | } |
1131 | device += count; |
1132 | runs += count; |
1133 | antialias += count; |
1134 | x += count; |
1135 | } |
1136 | } else { |
1137 | for (;;) { |
1138 | int count = *runs; |
1139 | if (count <= 0) { |
1140 | break; |
1141 | } |
1142 | int aa = *antialias; |
1143 | if (aa) { |
1144 | shaderContext->shadeSpan(x, y, span, count); |
1145 | if (aa == 255) { |
1146 | fProc32(device, span, count, 255); |
1147 | } else { |
1148 | fProc32Blend(device, span, count, aa); |
1149 | } |
1150 | } |
1151 | device += count; |
1152 | runs += count; |
1153 | antialias += count; |
1154 | x += count; |
1155 | } |
1156 | } |
1157 | } |
1158 | |
// Vector shorthands used by the row blenders below:
//   U32  - four 32-bit lanes (drive() loads four pixels at a time into one)
//   U8x4 - sixteen 8-bit lanes (the same four pixels viewed channel by channel)
//   U8   - four 8-bit lanes (drive() loads one coverage byte per pixel into one)
using U32 = skvx::Vec< 4, uint32_t>;
using U8x4 = skvx::Vec<16, uint8_t>;
using U8 = skvx::Vec< 4, uint8_t>;
1162 | |
1163 | static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n, |
1164 | U8x4 (*kernel)(U8x4,U8x4,U8x4)) { |
1165 | |
1166 | auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 { |
1167 | U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov); |
1168 | return skvx::bit_pun<U32>(kernel(skvx::bit_pun<U8x4>(dst), |
1169 | skvx::bit_pun<U8x4>(src), |
1170 | cov_splat)); |
1171 | }; |
1172 | while (n >= 4) { |
1173 | apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst); |
1174 | dst += 4; |
1175 | src += 4; |
1176 | cov += 4; |
1177 | n -= 4; |
1178 | } |
1179 | while (n --> 0) { |
1180 | *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0]; |
1181 | dst++; |
1182 | src++; |
1183 | cov++; |
1184 | } |
1185 | } |
1186 | |
1187 | static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) { |
1188 | auto cov = (const uint8_t*)mask; |
1189 | drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) { |
1190 | U8x4 s_aa = skvx::approx_scale(s, c), |
1191 | alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa); |
1192 | return s_aa + skvx::approx_scale(d, 255 - alpha); |
1193 | }); |
1194 | } |
1195 | |
1196 | static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) { |
1197 | auto cov = (const uint8_t*)mask; |
1198 | drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) { |
1199 | return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>( c ) |
1200 | + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c)); |
1201 | }); |
1202 | } |
1203 | |
1204 | static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) { |
1205 | auto src_alpha_blend = [](int s, int d, int sa, int m) { |
1206 | return d + SkAlphaMul(s - SkAlphaMul(sa, d), m); |
1207 | }; |
1208 | |
1209 | auto upscale_31_to_255 = [](int v) { |
1210 | return (v << 3) | (v >> 2); |
1211 | }; |
1212 | |
1213 | auto mask = (const uint16_t*)vmask; |
1214 | for (int i = 0; i < n; ++i) { |
1215 | uint16_t m = mask[i]; |
1216 | if (0 == m) { |
1217 | continue; |
1218 | } |
1219 | |
1220 | SkPMColor s = src[i]; |
1221 | SkPMColor d = dst[i]; |
1222 | |
1223 | int srcA = SkGetPackedA32(s); |
1224 | int srcR = SkGetPackedR32(s); |
1225 | int srcG = SkGetPackedG32(s); |
1226 | int srcB = SkGetPackedB32(s); |
1227 | |
1228 | srcA += srcA >> 7; |
1229 | |
1230 | // We're ignoring the least significant bit of the green coverage channel here. |
1231 | int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5); |
1232 | int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5); |
1233 | int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5); |
1234 | |
1235 | // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend(). |
1236 | maskR = upscale_31_to_255(maskR); |
1237 | maskG = upscale_31_to_255(maskG); |
1238 | maskB = upscale_31_to_255(maskB); |
1239 | |
1240 | // This LCD blit routine only works if the destination is opaque. |
1241 | dst[i] = SkPackARGB32(0xFF, |
1242 | src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR), |
1243 | src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG), |
1244 | src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB)); |
1245 | } |
1246 | } |
1247 | |
1248 | static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) { |
1249 | auto mask = (const uint16_t*)vmask; |
1250 | |
1251 | for (int i = 0; i < n; ++i) { |
1252 | uint16_t m = mask[i]; |
1253 | if (0 == m) { |
1254 | continue; |
1255 | } |
1256 | |
1257 | SkPMColor s = src[i]; |
1258 | SkPMColor d = dst[i]; |
1259 | |
1260 | int srcR = SkGetPackedR32(s); |
1261 | int srcG = SkGetPackedG32(s); |
1262 | int srcB = SkGetPackedB32(s); |
1263 | |
1264 | // We're ignoring the least significant bit of the green coverage channel here. |
1265 | int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5); |
1266 | int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5); |
1267 | int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5); |
1268 | |
1269 | // Now upscale them to 0..32, so we can use blend_32. |
1270 | maskR = upscale_31_to_32(maskR); |
1271 | maskG = upscale_31_to_32(maskG); |
1272 | maskB = upscale_31_to_32(maskB); |
1273 | |
1274 | // This LCD blit routine only works if the destination is opaque. |
1275 | dst[i] = SkPackARGB32(0xFF, |
1276 | blend_32(srcR, SkGetPackedR32(d), maskR), |
1277 | blend_32(srcG, SkGetPackedG32(d), maskG), |
1278 | blend_32(srcB, SkGetPackedB32(d), maskB)); |
1279 | } |
1280 | } |
1281 | |
1282 | void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) { |
1283 | // we only handle kA8 with an xfermode |
1284 | if (fXfermode && (SkMask::kA8_Format != mask.fFormat)) { |
1285 | this->INHERITED::blitMask(mask, clip); |
1286 | return; |
1287 | } |
1288 | |
1289 | SkASSERT(mask.fBounds.contains(clip)); |
1290 | |
1291 | void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr; |
1292 | |
1293 | if (!fXfermode) { |
1294 | bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag); |
1295 | |
1296 | if (mask.fFormat == SkMask::kA8_Format && opaque) { |
1297 | blend_row = blend_row_A8_opaque; |
1298 | } else if (mask.fFormat == SkMask::kA8_Format) { |
1299 | blend_row = blend_row_A8; |
1300 | } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) { |
1301 | blend_row = blend_row_LCD16_opaque; |
1302 | } else if (mask.fFormat == SkMask::kLCD16_Format) { |
1303 | blend_row = blend_row_lcd16; |
1304 | } else { |
1305 | this->INHERITED::blitMask(mask, clip); |
1306 | return; |
1307 | } |
1308 | } |
1309 | |
1310 | const int x = clip.fLeft; |
1311 | const int width = clip.width(); |
1312 | int y = clip.fTop; |
1313 | int height = clip.height(); |
1314 | |
1315 | char* dstRow = (char*)fDevice.writable_addr32(x, y); |
1316 | const size_t dstRB = fDevice.rowBytes(); |
1317 | const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y); |
1318 | const size_t maskRB = mask.fRowBytes; |
1319 | |
1320 | SkPMColor* span = fBuffer; |
1321 | |
1322 | if (fXfermode) { |
1323 | SkASSERT(SkMask::kA8_Format == mask.fFormat); |
1324 | SkXfermode* xfer = fXfermode; |
1325 | do { |
1326 | fShaderContext->shadeSpan(x, y, span, width); |
1327 | xfer->xfer32(reinterpret_cast<SkPMColor*>(dstRow), span, width, maskRow); |
1328 | dstRow += dstRB; |
1329 | maskRow += maskRB; |
1330 | y += 1; |
1331 | } while (--height > 0); |
1332 | } else { |
1333 | SkASSERT(blend_row); |
1334 | do { |
1335 | fShaderContext->shadeSpan(x, y, span, width); |
1336 | blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width); |
1337 | dstRow += dstRB; |
1338 | maskRow += maskRB; |
1339 | y += 1; |
1340 | } while (--height > 0); |
1341 | } |
1342 | } |
1343 | |
1344 | void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) { |
1345 | SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height()); |
1346 | |
1347 | uint32_t* device = fDevice.writable_addr32(x, y); |
1348 | size_t deviceRB = fDevice.rowBytes(); |
1349 | |
1350 | if (fConstInY) { |
1351 | SkPMColor c; |
1352 | fShaderContext->shadeSpan(x, y, &c, 1); |
1353 | |
1354 | if (fShadeDirectlyIntoDevice) { |
1355 | if (255 == alpha) { |
1356 | do { |
1357 | *device = c; |
1358 | device = (uint32_t*)((char*)device + deviceRB); |
1359 | } while (--height > 0); |
1360 | } else { |
1361 | do { |
1362 | *device = SkFourByteInterp(c, *device, alpha); |
1363 | device = (uint32_t*)((char*)device + deviceRB); |
1364 | } while (--height > 0); |
1365 | } |
1366 | } else { |
1367 | SkXfermode* xfer = fXfermode; |
1368 | if (xfer) { |
1369 | do { |
1370 | xfer->xfer32(device, &c, 1, &alpha); |
1371 | device = (uint32_t*)((char*)device + deviceRB); |
1372 | } while (--height > 0); |
1373 | } else { |
1374 | SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend; |
1375 | do { |
1376 | proc(device, &c, 1, alpha); |
1377 | device = (uint32_t*)((char*)device + deviceRB); |
1378 | } while (--height > 0); |
1379 | } |
1380 | } |
1381 | return; |
1382 | } |
1383 | |
1384 | if (fShadeDirectlyIntoDevice) { |
1385 | if (255 == alpha) { |
1386 | do { |
1387 | fShaderContext->shadeSpan(x, y, device, 1); |
1388 | y += 1; |
1389 | device = (uint32_t*)((char*)device + deviceRB); |
1390 | } while (--height > 0); |
1391 | } else { |
1392 | do { |
1393 | SkPMColor c; |
1394 | fShaderContext->shadeSpan(x, y, &c, 1); |
1395 | *device = SkFourByteInterp(c, *device, alpha); |
1396 | y += 1; |
1397 | device = (uint32_t*)((char*)device + deviceRB); |
1398 | } while (--height > 0); |
1399 | } |
1400 | } else { |
1401 | SkPMColor* span = fBuffer; |
1402 | SkXfermode* xfer = fXfermode; |
1403 | if (xfer) { |
1404 | do { |
1405 | fShaderContext->shadeSpan(x, y, span, 1); |
1406 | xfer->xfer32(device, span, 1, &alpha); |
1407 | y += 1; |
1408 | device = (uint32_t*)((char*)device + deviceRB); |
1409 | } while (--height > 0); |
1410 | } else { |
1411 | SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend; |
1412 | do { |
1413 | fShaderContext->shadeSpan(x, y, span, 1); |
1414 | proc(device, span, 1, alpha); |
1415 | y += 1; |
1416 | device = (uint32_t*)((char*)device + deviceRB); |
1417 | } while (--height > 0); |
1418 | } |
1419 | } |
1420 | } |
1421 | |