1/*
2 * Copyright 2006 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "include/core/SkShader.h"
9#include "include/private/SkColorData.h"
10#include "include/private/SkVx.h"
11#include "src/core/SkCoreBlitters.h"
12#include "src/core/SkUtils.h"
13#include "src/core/SkXfermodePriv.h"
14
15static inline int upscale_31_to_32(int value) {
16 SkASSERT((unsigned)value <= 31);
17 return value + (value >> 4);
18}
19
20static inline int blend_32(int src, int dst, int scale) {
21 SkASSERT((unsigned)src <= 0xFF);
22 SkASSERT((unsigned)dst <= 0xFF);
23 SkASSERT((unsigned)scale <= 32);
24 return dst + ((src - dst) * scale >> 5);
25}
26
27static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
28 SkPMColor dst, uint16_t mask) {
29 if (mask == 0) {
30 return dst;
31 }
32
33 /* We want all of these in 5bits, hence the shifts in case one of them
34 * (green) is 6bits.
35 */
36 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
37 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
38 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
39
40 // Now upscale them to 0..32, so we can use blend32
41 maskR = upscale_31_to_32(maskR);
42 maskG = upscale_31_to_32(maskG);
43 maskB = upscale_31_to_32(maskB);
44
45 // srcA has been upscaled to 256 before passed into this function
46 maskR = maskR * srcA >> 8;
47 maskG = maskG * srcA >> 8;
48 maskB = maskB * srcA >> 8;
49
50 int dstR = SkGetPackedR32(dst);
51 int dstG = SkGetPackedG32(dst);
52 int dstB = SkGetPackedB32(dst);
53
54 // LCD blitting is only supported if the dst is known/required
55 // to be opaque
56 return SkPackARGB32(0xFF,
57 blend_32(srcR, dstR, maskR),
58 blend_32(srcG, dstG, maskG),
59 blend_32(srcB, dstB, maskB));
60}
61
62static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
63 SkPMColor dst, uint16_t mask,
64 SkPMColor opaqueDst) {
65 if (mask == 0) {
66 return dst;
67 }
68
69 if (0xFFFF == mask) {
70 return opaqueDst;
71 }
72
73 /* We want all of these in 5bits, hence the shifts in case one of them
74 * (green) is 6bits.
75 */
76 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
77 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
78 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
79
80 // Now upscale them to 0..32, so we can use blend32
81 maskR = upscale_31_to_32(maskR);
82 maskG = upscale_31_to_32(maskG);
83 maskB = upscale_31_to_32(maskB);
84
85 int dstR = SkGetPackedR32(dst);
86 int dstG = SkGetPackedG32(dst);
87 int dstB = SkGetPackedB32(dst);
88
89 // LCD blitting is only supported if the dst is known/required
90 // to be opaque
91 return SkPackARGB32(0xFF,
92 blend_32(srcR, dstR, maskR),
93 blend_32(srcG, dstG, maskG),
94 blend_32(srcB, dstB, maskB));
95}
96
97
98// TODO: rewrite at least the SSE code here. It's miserable.
99
100#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
101 #include <emmintrin.h>
102
103 // The following (left) shifts cause the top 5 bits of the mask components to
104 // line up with the corresponding components in an SkPMColor.
105 // Note that the mask's RGB16 order may differ from the SkPMColor order.
106 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
107 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
108 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
109
110 #if SK_R16x5_R32x5_SHIFT == 0
111 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
112 #elif SK_R16x5_R32x5_SHIFT > 0
113 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
114 #else
115 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
116 #endif
117
118 #if SK_G16x5_G32x5_SHIFT == 0
119 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
120 #elif SK_G16x5_G32x5_SHIFT > 0
121 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
122 #else
123 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
124 #endif
125
126 #if SK_B16x5_B32x5_SHIFT == 0
127 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
128 #elif SK_B16x5_B32x5_SHIFT > 0
129 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
130 #else
131 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
132 #endif
133
134 static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
135 // In the following comments, the components of src, dst and mask are
136 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
137 // by an R, G, B, or A suffix. Components of one of the four pixels that
138 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
139 // example is the blue channel of the second destination pixel. Memory
140 // layout is shown for an ARGB byte order in a color value.
141
142 // src and srcA store 8-bit values interleaved with zeros.
143 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
144 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
145 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
146 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
147 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
148 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
149 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
150
151 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
152 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
153 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
154 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
155
156 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
157 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
158 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
159
160 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
161 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
162 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
163
164 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
165 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
166 // 8-bit position
167 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
168 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
169 mask = _mm_or_si128(_mm_or_si128(r, g), b);
170
171 // Interleave R,G,B into the lower byte of word.
172 // i.e. split the sixteen 8-bit values from mask into two sets of eight
173 // 16-bit values, padded by zero.
174 __m128i maskLo, maskHi;
175 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
176 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
177 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
178 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
179
180 // Upscale from 0..31 to 0..32
181 // (allows to replace division by left-shift further down)
182 // Left-shift each component by 4 and add the result back to that component,
183 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
184 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
185 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
186
187 // Multiply each component of maskLo and maskHi by srcA
188 maskLo = _mm_mullo_epi16(maskLo, srcA);
189 maskHi = _mm_mullo_epi16(maskHi, srcA);
190
191 // Left shift mask components by 8 (divide by 256)
192 maskLo = _mm_srli_epi16(maskLo, 8);
193 maskHi = _mm_srli_epi16(maskHi, 8);
194
195 // Interleave R,G,B into the lower byte of the word
196 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
197 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
198 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
199 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
200
201 // mask = (src - dst) * mask
202 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
203 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
204
205 // mask = (src - dst) * mask >> 5
206 maskLo = _mm_srai_epi16(maskLo, 5);
207 maskHi = _mm_srai_epi16(maskHi, 5);
208
209 // Add two pixels into result.
210 // result = dst + ((src - dst) * mask >> 5)
211 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
212 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
213
214 // Pack into 4 32bit dst pixels.
215 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
216 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
217 // clamping to 255 if necessary.
218 return _mm_packus_epi16(resultLo, resultHi);
219 }
220
221 static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
222 // In the following comments, the components of src, dst and mask are
223 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
224 // by an R, G, B, or A suffix. Components of one of the four pixels that
225 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
226 // example is the blue channel of the second destination pixel. Memory
227 // layout is shown for an ARGB byte order in a color value.
228
229 // src and srcA store 8-bit values interleaved with zeros.
230 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
231 // mask stores 16-bit values (shown as high and low bytes) interleaved with
232 // zeros
233 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
234 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
235
236 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
237 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
238 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
239 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
240
241 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
242 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
243 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
244
245 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
246 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
247 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
248
249 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
250 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
251 // 8-bit position
252 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
253 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
254 mask = _mm_or_si128(_mm_or_si128(r, g), b);
255
256 // Interleave R,G,B into the lower byte of word.
257 // i.e. split the sixteen 8-bit values from mask into two sets of eight
258 // 16-bit values, padded by zero.
259 __m128i maskLo, maskHi;
260 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
261 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
262 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
263 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
264
265 // Upscale from 0..31 to 0..32
266 // (allows to replace division by left-shift further down)
267 // Left-shift each component by 4 and add the result back to that component,
268 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
269 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
270 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
271
272 // Interleave R,G,B into the lower byte of the word
273 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
274 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
275 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
276 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
277
278 // mask = (src - dst) * mask
279 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
280 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
281
282 // mask = (src - dst) * mask >> 5
283 maskLo = _mm_srai_epi16(maskLo, 5);
284 maskHi = _mm_srai_epi16(maskHi, 5);
285
286 // Add two pixels into result.
287 // result = dst + ((src - dst) * mask >> 5)
288 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
289 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
290
291 // Pack into 4 32bit dst pixels and force opaque.
292 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
293 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
294 // clamping to 255 if necessary. Set alpha components to 0xFF.
295 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
296 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
297 }
298
299 void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
300 if (width <= 0) {
301 return;
302 }
303
304 int srcA = SkColorGetA(src);
305 int srcR = SkColorGetR(src);
306 int srcG = SkColorGetG(src);
307 int srcB = SkColorGetB(src);
308
309 srcA = SkAlpha255To256(srcA);
310
311 if (width >= 4) {
312 SkASSERT(((size_t)dst & 0x03) == 0);
313 while (((size_t)dst & 0x0F) != 0) {
314 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
315 mask++;
316 dst++;
317 width--;
318 }
319
320 __m128i *d = reinterpret_cast<__m128i*>(dst);
321 // Set alpha to 0xFF and replicate source four times in SSE register.
322 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
323 // Interleave with zeros to get two sets of four 16-bit values.
324 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
325 // Set srcA_sse to contain eight copies of srcA, padded with zero.
326 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
327 __m128i srcA_sse = _mm_set1_epi16(srcA);
328 while (width >= 4) {
329 // Load four destination pixels into dst_sse.
330 __m128i dst_sse = _mm_load_si128(d);
331 // Load four 16-bit masks into lower half of mask_sse.
332 __m128i mask_sse = _mm_loadl_epi64(
333 reinterpret_cast<const __m128i*>(mask));
334
335 // Check whether masks are equal to 0 and get the highest bit
336 // of each byte of result, if masks are all zero, we will get
337 // pack_cmp to 0xFFFF
338 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
339 _mm_setzero_si128()));
340
341 // if mask pixels are not all zero, we will blend the dst pixels
342 if (pack_cmp != 0xFFFF) {
343 // Unpack 4 16bit mask pixels to
344 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
345 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
346 mask_sse = _mm_unpacklo_epi16(mask_sse,
347 _mm_setzero_si128());
348
349 // Process 4 32bit dst pixels
350 __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
351 _mm_store_si128(d, result);
352 }
353
354 d++;
355 mask += 4;
356 width -= 4;
357 }
358
359 dst = reinterpret_cast<SkPMColor*>(d);
360 }
361
362 while (width > 0) {
363 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
364 mask++;
365 dst++;
366 width--;
367 }
368 }
369
370 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
371 SkColor src, int width, SkPMColor opaqueDst) {
372 if (width <= 0) {
373 return;
374 }
375
376 int srcR = SkColorGetR(src);
377 int srcG = SkColorGetG(src);
378 int srcB = SkColorGetB(src);
379
380 if (width >= 4) {
381 SkASSERT(((size_t)dst & 0x03) == 0);
382 while (((size_t)dst & 0x0F) != 0) {
383 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
384 mask++;
385 dst++;
386 width--;
387 }
388
389 __m128i *d = reinterpret_cast<__m128i*>(dst);
390 // Set alpha to 0xFF and replicate source four times in SSE register.
391 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
392 // Set srcA_sse to contain eight copies of srcA, padded with zero.
393 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
394 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
395 while (width >= 4) {
396 // Load four destination pixels into dst_sse.
397 __m128i dst_sse = _mm_load_si128(d);
398 // Load four 16-bit masks into lower half of mask_sse.
399 __m128i mask_sse = _mm_loadl_epi64(
400 reinterpret_cast<const __m128i*>(mask));
401
402 // Check whether masks are equal to 0 and get the highest bit
403 // of each byte of result, if masks are all zero, we will get
404 // pack_cmp to 0xFFFF
405 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
406 _mm_setzero_si128()));
407
408 // if mask pixels are not all zero, we will blend the dst pixels
409 if (pack_cmp != 0xFFFF) {
410 // Unpack 4 16bit mask pixels to
411 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
412 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
413 mask_sse = _mm_unpacklo_epi16(mask_sse,
414 _mm_setzero_si128());
415
416 // Process 4 32bit dst pixels
417 __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
418 _mm_store_si128(d, result);
419 }
420
421 d++;
422 mask += 4;
423 width -= 4;
424 }
425
426 dst = reinterpret_cast<SkPMColor*>(d);
427 }
428
429 while (width > 0) {
430 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
431 mask++;
432 dst++;
433 width--;
434 }
435 }
436
437#elif defined(SK_ARM_HAS_NEON)
438 #include <arm_neon.h>
439
440 #define NEON_A (SK_A32_SHIFT / 8)
441 #define NEON_R (SK_R32_SHIFT / 8)
442 #define NEON_G (SK_G32_SHIFT / 8)
443 #define NEON_B (SK_B32_SHIFT / 8)
444
445 static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
446 int16x8_t src_wide, dst_wide;
447
448 src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
449 dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
450
451 src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
452
453 dst_wide += vshrq_n_s16(src_wide, 5);
454
455 return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
456 }
457
458 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
459 SkColor color, int width,
460 SkPMColor opaqueDst) {
461 int colR = SkColorGetR(color);
462 int colG = SkColorGetG(color);
463 int colB = SkColorGetB(color);
464
465 uint8x8_t vcolR = vdup_n_u8(colR);
466 uint8x8_t vcolG = vdup_n_u8(colG);
467 uint8x8_t vcolB = vdup_n_u8(colB);
468 uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
469 uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
470 uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
471 uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
472
473 while (width >= 8) {
474 uint8x8x4_t vdst;
475 uint16x8_t vmask;
476 uint16x8_t vmaskR, vmaskG, vmaskB;
477 uint8x8_t vsel_trans, vsel_opq;
478
479 vdst = vld4_u8((uint8_t*)dst);
480 vmask = vld1q_u16(src);
481
482 // Prepare compare masks
483 vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
484 vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
485
486 // Get all the color masks on 5 bits
487 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
488 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
489 SK_B16_BITS + SK_R16_BITS + 1);
490 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
491
492 // Upscale to 0..32
493 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
494 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
495 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
496
497 vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
498 vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
499
500 vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
501 vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
502 vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
503
504 vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
505 vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
506 vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
507
508 vst4_u8((uint8_t*)dst, vdst);
509
510 dst += 8;
511 src += 8;
512 width -= 8;
513 }
514
515 // Leftovers
516 for (int i = 0; i < width; i++) {
517 dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
518 }
519 }
520
521 void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
522 SkColor color, int width, SkPMColor) {
523 int colA = SkColorGetA(color);
524 int colR = SkColorGetR(color);
525 int colG = SkColorGetG(color);
526 int colB = SkColorGetB(color);
527
528 colA = SkAlpha255To256(colA);
529
530 uint16x8_t vcolA = vdupq_n_u16(colA);
531 uint8x8_t vcolR = vdup_n_u8(colR);
532 uint8x8_t vcolG = vdup_n_u8(colG);
533 uint8x8_t vcolB = vdup_n_u8(colB);
534
535 while (width >= 8) {
536 uint8x8x4_t vdst;
537 uint16x8_t vmask;
538 uint16x8_t vmaskR, vmaskG, vmaskB;
539
540 vdst = vld4_u8((uint8_t*)dst);
541 vmask = vld1q_u16(src);
542
543 // Get all the color masks on 5 bits
544 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
545 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
546 SK_B16_BITS + SK_R16_BITS + 1);
547 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
548
549 // Upscale to 0..32
550 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
551 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
552 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
553
554 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
555 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
556 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
557
558 vdst.val[NEON_A] = vdup_n_u8(0xFF);
559 vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
560 vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
561 vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
562
563 vst4_u8((uint8_t*)dst, vdst);
564
565 dst += 8;
566 src += 8;
567 width -= 8;
568 }
569
570 for (int i = 0; i < width; i++) {
571 dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
572 }
573 }
574
575#else
576
577 static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
578 SkColor src, int width, SkPMColor) {
579 int srcA = SkColorGetA(src);
580 int srcR = SkColorGetR(src);
581 int srcG = SkColorGetG(src);
582 int srcB = SkColorGetB(src);
583
584 srcA = SkAlpha255To256(srcA);
585
586 for (int i = 0; i < width; i++) {
587 dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
588 }
589 }
590
591 static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
592 SkColor src, int width,
593 SkPMColor opaqueDst) {
594 int srcR = SkColorGetR(src);
595 int srcG = SkColorGetG(src);
596 int srcB = SkColorGetB(src);
597
598 for (int i = 0; i < width; i++) {
599 dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
600 }
601 }
602
603#endif
604
605static bool blit_color(const SkPixmap& device,
606 const SkMask& mask,
607 const SkIRect& clip,
608 SkColor color) {
609 int x = clip.fLeft,
610 y = clip.fTop;
611
612 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
613 SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
614 (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
615 color, clip.width(), clip.height());
616 return true;
617 }
618
619 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
620 auto dstRow = device.writable_addr32(x,y);
621 auto maskRow = (const uint16_t*)mask.getAddr(x,y);
622
623 auto blit_row = blit_row_lcd16;
624 SkPMColor opaqueDst = 0; // ignored unless opaque
625
626 if (0xff == SkColorGetA(color)) {
627 blit_row = blit_row_lcd16_opaque;
628 opaqueDst = SkPreMultiplyColor(color);
629 }
630
631 for (int height = clip.height(); height --> 0; ) {
632 blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
633
634 dstRow = (SkPMColor*) (( char*) dstRow + device.rowBytes());
635 maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes);
636 }
637 return true;
638 }
639
640 return false;
641}
642
643///////////////////////////////////////////////////////////////////////////////
644
645static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
646 const SkIRect& clip, SkPMColor srcColor) {
647 U8CPU alpha = SkGetPackedA32(srcColor);
648 unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
649 if (alpha != 255) {
650 flags |= SkBlitRow::kGlobalAlpha_Flag32;
651 }
652 SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
653
654 int x = clip.fLeft;
655 int y = clip.fTop;
656 int width = clip.width();
657 int height = clip.height();
658
659 SkPMColor* dstRow = device.writable_addr32(x, y);
660 const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
661
662 do {
663 proc(dstRow, srcRow, width, alpha);
664 dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
665 srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
666 } while (--height != 0);
667}
668
669//////////////////////////////////////////////////////////////////////////////////////
670
671SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
672 : INHERITED(device) {
673 SkColor color = paint.getColor();
674 fColor = color;
675
676 fSrcA = SkColorGetA(color);
677 unsigned scale = SkAlpha255To256(fSrcA);
678 fSrcR = SkAlphaMul(SkColorGetR(color), scale);
679 fSrcG = SkAlphaMul(SkColorGetG(color), scale);
680 fSrcB = SkAlphaMul(SkColorGetB(color), scale);
681
682 fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
683}
684
685const SkPixmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
686 if (255 == fSrcA) {
687 *value = fPMColor;
688 return &fDevice;
689 }
690 return nullptr;
691}
692
693#if defined _WIN32 // disable warning : local variable used without having been initialized
694#pragma warning ( push )
695#pragma warning ( disable : 4701 )
696#endif
697
698void SkARGB32_Blitter::blitH(int x, int y, int width) {
699 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
700
701 uint32_t* device = fDevice.writable_addr32(x, y);
702 SkBlitRow::Color32(device, device, width, fPMColor);
703}
704
705void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
706 const int16_t runs[]) {
707 if (fSrcA == 0) {
708 return;
709 }
710
711 uint32_t color = fPMColor;
712 uint32_t* device = fDevice.writable_addr32(x, y);
713 unsigned opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
714
715 for (;;) {
716 int count = runs[0];
717 SkASSERT(count >= 0);
718 if (count <= 0) {
719 return;
720 }
721 unsigned aa = antialias[0];
722 if (aa) {
723 if ((opaqueMask & aa) == 255) {
724 sk_memset32(device, color, count);
725 } else {
726 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
727 SkBlitRow::Color32(device, device, count, sc);
728 }
729 }
730 runs += count;
731 antialias += count;
732 device += count;
733 }
734}
735
736void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
737 uint32_t* device = fDevice.writable_addr32(x, y);
738 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
739
740 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
741 device[1] = SkBlendARGB32(fPMColor, device[1], a1);
742}
743
744void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
745 uint32_t* device = fDevice.writable_addr32(x, y);
746 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
747
748 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
749 device = (uint32_t*)((char*)device + fDevice.rowBytes());
750 device[0] = SkBlendARGB32(fPMColor, device[0], a1);
751}
752
753//////////////////////////////////////////////////////////////////////////////////////
754
// Writes color into dst[0..7] wherever the matching bit of the 8-bit mask is
// set. The most significant bit (0x80) controls dst[0] and the least
// significant (0x01) controls dst[7].
// Every argument is parenthesized so that expression arguments (for example
// `a | b` as the mask or `p + k` as dst) expand with the intended precedence.
// Arguments are evaluated more than once, so they must be side-effect free.
#define solid_8_pixels(mask, dst, color)            \
    do {                                            \
        if ((mask) & 0x80) (dst)[0] = (color);      \
        if ((mask) & 0x40) (dst)[1] = (color);      \
        if ((mask) & 0x20) (dst)[2] = (color);      \
        if ((mask) & 0x10) (dst)[3] = (color);      \
        if ((mask) & 0x08) (dst)[4] = (color);      \
        if ((mask) & 0x04) (dst)[5] = (color);      \
        if ((mask) & 0x02) (dst)[6] = (color);      \
        if ((mask) & 0x01) (dst)[7] = (color);      \
    } while (0)
766
// Instantiates the BW-mask blit template under the name SkARGB32_BlitBW, with
// one extra SkPMColor argument, 32-bit device pixels, and solid_8_pixels as
// the per-byte operation (eight mask bits -> eight pixels).
// NOTE(review): the SK_BLITBWMASK_* macros are presumably consumed by the
// template header included last; confirm there that it also #undefs them,
// since the same names are defined again further down.
#define SK_BLITBWMASK_NAME SkARGB32_BlitBW
#define SK_BLITBWMASK_ARGS , SkPMColor color
#define SK_BLITBWMASK_BLIT8(mask, dst) solid_8_pixels(mask, dst, color)
#define SK_BLITBWMASK_GETADDR writable_addr32
#define SK_BLITBWMASK_DEVTYPE uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"
773
// For every set bit of the 8-bit mask, replaces the matching pixel of
// dst[0..7] with sc + SkAlphaMulQ(pixel, dst_scale). The most significant bit
// (0x80) controls dst[0] and the least significant (0x01) controls dst[7].
// Every argument is parenthesized so that expression arguments (for example
// `a | b` as the mask or `p + k` as dst) expand with the intended precedence.
// Arguments are evaluated more than once, so they must be side-effect free.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                      \
    do {                                                                              \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }  \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }  \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }  \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }  \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }  \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }  \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }  \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }  \
    } while (0)
785
// Instantiates the BW-mask blit template a second time, as SkARGB32_BlendBW,
// with two extra arguments (the source color sc and the destination scale),
// 32-bit device pixels, and blend_8_pixels as the per-byte operation.
// NOTE(review): this relies on the template header having #undef'd the
// SK_BLITBWMASK_* macros after any earlier use; confirm in that header.
#define SK_BLITBWMASK_NAME SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst) blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR writable_addr32
#define SK_BLITBWMASK_DEVTYPE uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"
792
793void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
794 SkASSERT(mask.fBounds.contains(clip));
795 SkASSERT(fSrcA != 0xFF);
796
797 if (fSrcA == 0) {
798 return;
799 }
800
801 if (blit_color(fDevice, mask, clip, fColor)) {
802 return;
803 }
804
805 switch (mask.fFormat) {
806 case SkMask::kBW_Format:
807 SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
808 break;
809 case SkMask::kARGB32_Format:
810 SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
811 break;
812 default:
813 SK_ABORT("Mask format not handled.");
814 }
815}
816
817void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
818 const SkIRect& clip) {
819 SkASSERT(mask.fBounds.contains(clip));
820
821 if (blit_color(fDevice, mask, clip, fColor)) {
822 return;
823 }
824
825 switch (mask.fFormat) {
826 case SkMask::kBW_Format:
827 SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
828 break;
829 case SkMask::kARGB32_Format:
830 SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
831 break;
832 default:
833 SK_ABORT("Mask format not handled.");
834 }
835}
836
837void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
838 uint32_t* device = fDevice.writable_addr32(x, y);
839 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
840
841 device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
842 device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
843}
844
845void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
846 uint32_t* device = fDevice.writable_addr32(x, y);
847 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
848
849 device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
850 device = (uint32_t*)((char*)device + fDevice.rowBytes());
851 device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
852}
853
854///////////////////////////////////////////////////////////////////////////////
855
856void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
857 if (alpha == 0 || fSrcA == 0) {
858 return;
859 }
860
861 uint32_t* device = fDevice.writable_addr32(x, y);
862 uint32_t color = fPMColor;
863
864 if (alpha != 255) {
865 color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
866 }
867
868 unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
869 size_t rowBytes = fDevice.rowBytes();
870 while (--height >= 0) {
871 device[0] = color + SkAlphaMulQ(device[0], dst_scale);
872 device = (uint32_t*)((char*)device + rowBytes);
873 }
874}
875
876void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
877 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
878
879 if (fSrcA == 0) {
880 return;
881 }
882
883 uint32_t* device = fDevice.writable_addr32(x, y);
884 uint32_t color = fPMColor;
885 size_t rowBytes = fDevice.rowBytes();
886
887 if (SkGetPackedA32(fPMColor) == 0xFF) {
888 SkOpts::rect_memset32(device, color, width, rowBytes, height);
889 } else {
890 while (height --> 0) {
891 SkBlitRow::Color32(device, device, width, color);
892 device = (uint32_t*)((char*)device + rowBytes);
893 }
894 }
895}
896
897#if defined _WIN32
898#pragma warning ( pop )
899#endif
900
901///////////////////////////////////////////////////////////////////////
902
903void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
904 const int16_t runs[]) {
905 uint32_t* device = fDevice.writable_addr32(x, y);
906 SkPMColor black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
907
908 for (;;) {
909 int count = runs[0];
910 SkASSERT(count >= 0);
911 if (count <= 0) {
912 return;
913 }
914 unsigned aa = antialias[0];
915 if (aa) {
916 if (aa == 255) {
917 sk_memset32(device, black, count);
918 } else {
919 SkPMColor src = aa << SK_A32_SHIFT;
920 unsigned dst_scale = 256 - aa;
921 int n = count;
922 do {
923 --n;
924 device[n] = src + SkAlphaMulQ(device[n], dst_scale);
925 } while (n > 0);
926 }
927 }
928 runs += count;
929 antialias += count;
930 device += count;
931 }
932}
933
934void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
935 uint32_t* device = fDevice.writable_addr32(x, y);
936 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
937
938 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
939 device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
940}
941
942void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
943 uint32_t* device = fDevice.writable_addr32(x, y);
944 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
945
946 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
947 device = (uint32_t*)((char*)device + fDevice.rowBytes());
948 device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
949}
950
951///////////////////////////////////////////////////////////////////////////////
952
953// Special version of SkBlitRow::Factory32 that knows we're in kSrc_Mode,
954// instead of kSrcOver_Mode
955static void blend_srcmode(SkPMColor* SK_RESTRICT device,
956 const SkPMColor* SK_RESTRICT span,
957 int count, U8CPU aa) {
958 int aa256 = SkAlpha255To256(aa);
959 for (int i = 0; i < count; ++i) {
960 device[i] = SkFourByteInterp256(span[i], device[i], aa256);
961 }
962}
963
964SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
965 const SkPaint& paint, SkShaderBase::Context* shaderContext)
966 : INHERITED(device, paint, shaderContext)
967{
968 fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
969
970 fXfermode = SkXfermode::Peek(paint.getBlendMode());
971
972 int flags = 0;
973 if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
974 flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
975 }
976 // we call this on the output from the shader
977 fProc32 = SkBlitRow::Factory32(flags);
978 // we call this on the output from the shader + alpha from the aa buffer
979 fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);
980
981 fShadeDirectlyIntoDevice = false;
982 if (fXfermode == nullptr) {
983 if (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag) {
984 fShadeDirectlyIntoDevice = true;
985 }
986 } else {
987 if (SkBlendMode::kSrc == paint.getBlendMode()) {
988 fShadeDirectlyIntoDevice = true;
989 fProc32Blend = blend_srcmode;
990 }
991 }
992
993 fConstInY = SkToBool(shaderContext->getFlags() & SkShaderBase::kConstInY32_Flag);
994}
995
996SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
997 sk_free(fBuffer);
998}
999
1000void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
1001 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1002
1003 uint32_t* device = fDevice.writable_addr32(x, y);
1004
1005 if (fShadeDirectlyIntoDevice) {
1006 fShaderContext->shadeSpan(x, y, device, width);
1007 } else {
1008 SkPMColor* span = fBuffer;
1009 fShaderContext->shadeSpan(x, y, span, width);
1010 if (fXfermode) {
1011 fXfermode->xfer32(device, span, width, nullptr);
1012 } else {
1013 fProc32(device, span, width, 255);
1014 }
1015 }
1016}
1017
1018void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1019 SkASSERT(x >= 0 && y >= 0 &&
1020 x + width <= fDevice.width() && y + height <= fDevice.height());
1021
1022 uint32_t* device = fDevice.writable_addr32(x, y);
1023 size_t deviceRB = fDevice.rowBytes();
1024 auto* shaderContext = fShaderContext;
1025 SkPMColor* span = fBuffer;
1026
1027 if (fConstInY) {
1028 if (fShadeDirectlyIntoDevice) {
1029 // shade the first row directly into the device
1030 shaderContext->shadeSpan(x, y, device, width);
1031 span = device;
1032 while (--height > 0) {
1033 device = (uint32_t*)((char*)device + deviceRB);
1034 memcpy(device, span, width << 2);
1035 }
1036 } else {
1037 shaderContext->shadeSpan(x, y, span, width);
1038 SkXfermode* xfer = fXfermode;
1039 if (xfer) {
1040 do {
1041 xfer->xfer32(device, span, width, nullptr);
1042 y += 1;
1043 device = (uint32_t*)((char*)device + deviceRB);
1044 } while (--height > 0);
1045 } else {
1046 SkBlitRow::Proc32 proc = fProc32;
1047 do {
1048 proc(device, span, width, 255);
1049 y += 1;
1050 device = (uint32_t*)((char*)device + deviceRB);
1051 } while (--height > 0);
1052 }
1053 }
1054 return;
1055 }
1056
1057 if (fShadeDirectlyIntoDevice) {
1058 do {
1059 shaderContext->shadeSpan(x, y, device, width);
1060 y += 1;
1061 device = (uint32_t*)((char*)device + deviceRB);
1062 } while (--height > 0);
1063 } else {
1064 SkXfermode* xfer = fXfermode;
1065 if (xfer) {
1066 do {
1067 shaderContext->shadeSpan(x, y, span, width);
1068 xfer->xfer32(device, span, width, nullptr);
1069 y += 1;
1070 device = (uint32_t*)((char*)device + deviceRB);
1071 } while (--height > 0);
1072 } else {
1073 SkBlitRow::Proc32 proc = fProc32;
1074 do {
1075 shaderContext->shadeSpan(x, y, span, width);
1076 proc(device, span, width, 255);
1077 y += 1;
1078 device = (uint32_t*)((char*)device + deviceRB);
1079 } while (--height > 0);
1080 }
1081 }
1082}
1083
1084void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1085 const int16_t runs[]) {
1086 SkPMColor* span = fBuffer;
1087 uint32_t* device = fDevice.writable_addr32(x, y);
1088 auto* shaderContext = fShaderContext;
1089
1090 if (fXfermode && !fShadeDirectlyIntoDevice) {
1091 for (;;) {
1092 SkXfermode* xfer = fXfermode;
1093
1094 int count = *runs;
1095 if (count <= 0)
1096 break;
1097 int aa = *antialias;
1098 if (aa) {
1099 shaderContext->shadeSpan(x, y, span, count);
1100 if (aa == 255) {
1101 xfer->xfer32(device, span, count, nullptr);
1102 } else {
1103 // count is almost always 1
1104 for (int i = count - 1; i >= 0; --i) {
1105 xfer->xfer32(&device[i], &span[i], 1, antialias);
1106 }
1107 }
1108 }
1109 device += count;
1110 runs += count;
1111 antialias += count;
1112 x += count;
1113 }
1114 } else if (fShadeDirectlyIntoDevice ||
1115 (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1116 for (;;) {
1117 int count = *runs;
1118 if (count <= 0) {
1119 break;
1120 }
1121 int aa = *antialias;
1122 if (aa) {
1123 if (aa == 255) {
1124 // cool, have the shader draw right into the device
1125 shaderContext->shadeSpan(x, y, device, count);
1126 } else {
1127 shaderContext->shadeSpan(x, y, span, count);
1128 fProc32Blend(device, span, count, aa);
1129 }
1130 }
1131 device += count;
1132 runs += count;
1133 antialias += count;
1134 x += count;
1135 }
1136 } else {
1137 for (;;) {
1138 int count = *runs;
1139 if (count <= 0) {
1140 break;
1141 }
1142 int aa = *antialias;
1143 if (aa) {
1144 shaderContext->shadeSpan(x, y, span, count);
1145 if (aa == 255) {
1146 fProc32(device, span, count, 255);
1147 } else {
1148 fProc32Blend(device, span, count, aa);
1149 }
1150 }
1151 device += count;
1152 runs += count;
1153 antialias += count;
1154 x += count;
1155 }
1156 }
1157}
1158
// Vector types shared by drive() and the A8 row blenders below.
// U32:  four 32-bit lanes; drive() loads four SkPMColor pixels into one.
using U32  = skvx::Vec< 4, uint32_t>;
// U8x4: sixteen 8-bit lanes; the byte-wise view of a U32 that the kernels work on.
using U8x4 = skvx::Vec<16, uint8_t>;
// U8:   four 8-bit lanes; drive() loads four coverage bytes into one.
using U8   = skvx::Vec< 4, uint8_t>;
1162
1163static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1164 U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1165
1166 auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1167 U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1168 return skvx::bit_pun<U32>(kernel(skvx::bit_pun<U8x4>(dst),
1169 skvx::bit_pun<U8x4>(src),
1170 cov_splat));
1171 };
1172 while (n >= 4) {
1173 apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1174 dst += 4;
1175 src += 4;
1176 cov += 4;
1177 n -= 4;
1178 }
1179 while (n --> 0) {
1180 *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1181 dst++;
1182 src++;
1183 cov++;
1184 }
1185}
1186
1187static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1188 auto cov = (const uint8_t*)mask;
1189 drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1190 U8x4 s_aa = skvx::approx_scale(s, c),
1191 alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
1192 return s_aa + skvx::approx_scale(d, 255 - alpha);
1193 });
1194}
1195
1196static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1197 auto cov = (const uint8_t*)mask;
1198 drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1199 return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>( c )
1200 + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
1201 });
1202}
1203
1204static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1205 auto src_alpha_blend = [](int s, int d, int sa, int m) {
1206 return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1207 };
1208
1209 auto upscale_31_to_255 = [](int v) {
1210 return (v << 3) | (v >> 2);
1211 };
1212
1213 auto mask = (const uint16_t*)vmask;
1214 for (int i = 0; i < n; ++i) {
1215 uint16_t m = mask[i];
1216 if (0 == m) {
1217 continue;
1218 }
1219
1220 SkPMColor s = src[i];
1221 SkPMColor d = dst[i];
1222
1223 int srcA = SkGetPackedA32(s);
1224 int srcR = SkGetPackedR32(s);
1225 int srcG = SkGetPackedG32(s);
1226 int srcB = SkGetPackedB32(s);
1227
1228 srcA += srcA >> 7;
1229
1230 // We're ignoring the least significant bit of the green coverage channel here.
1231 int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1232 int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1233 int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1234
1235 // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1236 maskR = upscale_31_to_255(maskR);
1237 maskG = upscale_31_to_255(maskG);
1238 maskB = upscale_31_to_255(maskB);
1239
1240 // This LCD blit routine only works if the destination is opaque.
1241 dst[i] = SkPackARGB32(0xFF,
1242 src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1243 src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1244 src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1245 }
1246}
1247
1248static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1249 auto mask = (const uint16_t*)vmask;
1250
1251 for (int i = 0; i < n; ++i) {
1252 uint16_t m = mask[i];
1253 if (0 == m) {
1254 continue;
1255 }
1256
1257 SkPMColor s = src[i];
1258 SkPMColor d = dst[i];
1259
1260 int srcR = SkGetPackedR32(s);
1261 int srcG = SkGetPackedG32(s);
1262 int srcB = SkGetPackedB32(s);
1263
1264 // We're ignoring the least significant bit of the green coverage channel here.
1265 int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1266 int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1267 int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1268
1269 // Now upscale them to 0..32, so we can use blend_32.
1270 maskR = upscale_31_to_32(maskR);
1271 maskG = upscale_31_to_32(maskG);
1272 maskB = upscale_31_to_32(maskB);
1273
1274 // This LCD blit routine only works if the destination is opaque.
1275 dst[i] = SkPackARGB32(0xFF,
1276 blend_32(srcR, SkGetPackedR32(d), maskR),
1277 blend_32(srcG, SkGetPackedG32(d), maskG),
1278 blend_32(srcB, SkGetPackedB32(d), maskB));
1279 }
1280}
1281
1282void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1283 // we only handle kA8 with an xfermode
1284 if (fXfermode && (SkMask::kA8_Format != mask.fFormat)) {
1285 this->INHERITED::blitMask(mask, clip);
1286 return;
1287 }
1288
1289 SkASSERT(mask.fBounds.contains(clip));
1290
1291 void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1292
1293 if (!fXfermode) {
1294 bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1295
1296 if (mask.fFormat == SkMask::kA8_Format && opaque) {
1297 blend_row = blend_row_A8_opaque;
1298 } else if (mask.fFormat == SkMask::kA8_Format) {
1299 blend_row = blend_row_A8;
1300 } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1301 blend_row = blend_row_LCD16_opaque;
1302 } else if (mask.fFormat == SkMask::kLCD16_Format) {
1303 blend_row = blend_row_lcd16;
1304 } else {
1305 this->INHERITED::blitMask(mask, clip);
1306 return;
1307 }
1308 }
1309
1310 const int x = clip.fLeft;
1311 const int width = clip.width();
1312 int y = clip.fTop;
1313 int height = clip.height();
1314
1315 char* dstRow = (char*)fDevice.writable_addr32(x, y);
1316 const size_t dstRB = fDevice.rowBytes();
1317 const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1318 const size_t maskRB = mask.fRowBytes;
1319
1320 SkPMColor* span = fBuffer;
1321
1322 if (fXfermode) {
1323 SkASSERT(SkMask::kA8_Format == mask.fFormat);
1324 SkXfermode* xfer = fXfermode;
1325 do {
1326 fShaderContext->shadeSpan(x, y, span, width);
1327 xfer->xfer32(reinterpret_cast<SkPMColor*>(dstRow), span, width, maskRow);
1328 dstRow += dstRB;
1329 maskRow += maskRB;
1330 y += 1;
1331 } while (--height > 0);
1332 } else {
1333 SkASSERT(blend_row);
1334 do {
1335 fShaderContext->shadeSpan(x, y, span, width);
1336 blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1337 dstRow += dstRB;
1338 maskRow += maskRB;
1339 y += 1;
1340 } while (--height > 0);
1341 }
1342}
1343
1344void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1345 SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
1346
1347 uint32_t* device = fDevice.writable_addr32(x, y);
1348 size_t deviceRB = fDevice.rowBytes();
1349
1350 if (fConstInY) {
1351 SkPMColor c;
1352 fShaderContext->shadeSpan(x, y, &c, 1);
1353
1354 if (fShadeDirectlyIntoDevice) {
1355 if (255 == alpha) {
1356 do {
1357 *device = c;
1358 device = (uint32_t*)((char*)device + deviceRB);
1359 } while (--height > 0);
1360 } else {
1361 do {
1362 *device = SkFourByteInterp(c, *device, alpha);
1363 device = (uint32_t*)((char*)device + deviceRB);
1364 } while (--height > 0);
1365 }
1366 } else {
1367 SkXfermode* xfer = fXfermode;
1368 if (xfer) {
1369 do {
1370 xfer->xfer32(device, &c, 1, &alpha);
1371 device = (uint32_t*)((char*)device + deviceRB);
1372 } while (--height > 0);
1373 } else {
1374 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1375 do {
1376 proc(device, &c, 1, alpha);
1377 device = (uint32_t*)((char*)device + deviceRB);
1378 } while (--height > 0);
1379 }
1380 }
1381 return;
1382 }
1383
1384 if (fShadeDirectlyIntoDevice) {
1385 if (255 == alpha) {
1386 do {
1387 fShaderContext->shadeSpan(x, y, device, 1);
1388 y += 1;
1389 device = (uint32_t*)((char*)device + deviceRB);
1390 } while (--height > 0);
1391 } else {
1392 do {
1393 SkPMColor c;
1394 fShaderContext->shadeSpan(x, y, &c, 1);
1395 *device = SkFourByteInterp(c, *device, alpha);
1396 y += 1;
1397 device = (uint32_t*)((char*)device + deviceRB);
1398 } while (--height > 0);
1399 }
1400 } else {
1401 SkPMColor* span = fBuffer;
1402 SkXfermode* xfer = fXfermode;
1403 if (xfer) {
1404 do {
1405 fShaderContext->shadeSpan(x, y, span, 1);
1406 xfer->xfer32(device, span, 1, &alpha);
1407 y += 1;
1408 device = (uint32_t*)((char*)device + deviceRB);
1409 } while (--height > 0);
1410 } else {
1411 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1412 do {
1413 fShaderContext->shadeSpan(x, y, span, 1);
1414 proc(device, span, 1, alpha);
1415 y += 1;
1416 device = (uint32_t*)((char*)device + deviceRB);
1417 } while (--height > 0);
1418 }
1419 }
1420}
1421