1 | /* |
2 | * Copyright 2015 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkBlitMask_opts_DEFINED |
9 | #define SkBlitMask_opts_DEFINED |
10 | |
11 | #include "src/core/Sk4px.h" |
12 | |
13 | namespace SK_OPTS_NS { |
14 | |
15 | #if defined(SK_ARM_HAS_NEON) |
// The Sk4px versions below will work fine with NEON, but we have had many indications
// that they don't perform as well as this NEON-specific code. TODO(mtklein): why?
18 | |
// Byte index of each channel inside a packed 32-bit pixel, derived from the channel's
// bit shift. The NEON code below uses these to index the val[] planes of the
// uint8x8x4_t vectors it loads with vld4_u8 and stores with vst4_u8.
#define NEON_A (SK_A32_SHIFT / 8)
#define NEON_R (SK_R32_SHIFT / 8)
#define NEON_G (SK_G32_SHIFT / 8)
#define NEON_B (SK_B32_SHIFT / 8)
23 | |
24 | static inline uint16x8_t SkAlpha255To256_neon8(uint8x8_t alpha) { |
25 | return vaddw_u8(vdupq_n_u16(1), alpha); |
26 | } |
27 | |
28 | static inline uint8x8_t SkAlphaMul_neon8(uint8x8_t color, uint16x8_t scale) { |
29 | return vshrn_n_u16(vmovl_u8(color) * scale, 8); |
30 | } |
31 | |
32 | static inline uint8x8x4_t SkAlphaMulQ_neon8(uint8x8x4_t color, uint16x8_t scale) { |
33 | uint8x8x4_t ret; |
34 | |
35 | ret.val[0] = SkAlphaMul_neon8(color.val[0], scale); |
36 | ret.val[1] = SkAlphaMul_neon8(color.val[1], scale); |
37 | ret.val[2] = SkAlphaMul_neon8(color.val[2], scale); |
38 | ret.val[3] = SkAlphaMul_neon8(color.val[3], scale); |
39 | |
40 | return ret; |
41 | } |
42 | |
43 | |
44 | template <bool isColor> |
45 | static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, |
46 | const void* SK_RESTRICT maskPtr, size_t maskRB, |
47 | SkColor color, int width, int height) { |
48 | SkPMColor pmc = SkPreMultiplyColor(color); |
49 | SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
50 | const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
51 | uint8x8x4_t vpmc; |
52 | |
53 | maskRB -= width; |
54 | dstRB -= (width << 2); |
55 | |
56 | if (width >= 8) { |
57 | vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); |
58 | vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); |
59 | vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); |
60 | vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); |
61 | } |
62 | do { |
63 | int w = width; |
64 | while (w >= 8) { |
65 | uint8x8_t vmask = vld1_u8(mask); |
66 | uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); |
67 | if (isColor) { |
68 | vscale = vsubw_u8(vdupq_n_u16(256), |
69 | SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); |
70 | } else { |
71 | vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
72 | } |
73 | uint8x8x4_t vdev = vld4_u8((uint8_t*)device); |
74 | |
75 | vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) |
76 | + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); |
77 | vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) |
78 | + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); |
79 | vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) |
80 | + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); |
81 | vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) |
82 | + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); |
83 | |
84 | vst4_u8((uint8_t*)device, vdev); |
85 | |
86 | mask += 8; |
87 | device += 8; |
88 | w -= 8; |
89 | } |
90 | |
91 | while (w--) { |
92 | unsigned aa = *mask++; |
93 | if (isColor) { |
94 | *device = SkBlendARGB32(pmc, *device, aa); |
95 | } else { |
96 | *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) |
97 | + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
98 | } |
99 | device += 1; |
100 | } |
101 | |
102 | device = (uint32_t*)((char*)device + dstRB); |
103 | mask += maskRB; |
104 | |
105 | } while (--height != 0); |
106 | } |
107 | |
108 | static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, |
109 | const SkAlpha* mask, size_t maskRB, |
110 | SkColor color, int w, int h) { |
111 | D32_A8_Opaque_Color_neon<true>(dst, dstRB, mask, maskRB, color, w, h); |
112 | } |
113 | |
114 | // As above, but made slightly simpler by requiring that color is opaque. |
115 | static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, |
116 | const SkAlpha* mask, size_t maskRB, |
117 | SkColor color, int w, int h) { |
118 | D32_A8_Opaque_Color_neon<false>(dst, dstRB, mask, maskRB, color, w, h); |
119 | } |
120 | |
121 | // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. |
122 | static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, |
123 | const SkAlpha* maskPtr, size_t maskRB, |
124 | int width, int height) { |
125 | SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
126 | const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
127 | |
128 | maskRB -= width; |
129 | dstRB -= (width << 2); |
130 | do { |
131 | int w = width; |
132 | while (w >= 8) { |
133 | uint8x8_t vmask = vld1_u8(mask); |
134 | uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
135 | uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); |
136 | |
137 | vdevice = SkAlphaMulQ_neon8(vdevice, vscale); |
138 | vdevice.val[NEON_A] += vmask; |
139 | |
140 | vst4_u8((uint8_t*)device, vdevice); |
141 | |
142 | mask += 8; |
143 | device += 8; |
144 | w -= 8; |
145 | } |
146 | while (w-- > 0) { |
147 | unsigned aa = *mask++; |
148 | *device = (aa << SK_A32_SHIFT) |
149 | + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
150 | device += 1; |
151 | } |
152 | device = (uint32_t*)((char*)device + dstRB); |
153 | mask += maskRB; |
154 | } while (--height != 0); |
155 | } |
156 | |
157 | #else |
158 | static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB, |
159 | const SkAlpha* mask, size_t maskRB, |
160 | SkColor color, int w, int h) { |
161 | auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); |
162 | auto fn = [&](const Sk4px& d, const Sk4px& aa) { |
163 | // = (s + d(1-sa))aa + d(1-aa) |
164 | // = s*aa + d(1-sa*aa) |
165 | auto left = s.approxMulDiv255(aa), |
166 | right = d.approxMulDiv255(left.alphas().inv()); |
167 | return left + right; // This does not overflow (exhaustively checked). |
168 | }; |
169 | while (h --> 0) { |
170 | Sk4px::MapDstAlpha(w, dst, mask, fn); |
171 | dst += dstRB / sizeof(*dst); |
172 | mask += maskRB / sizeof(*mask); |
173 | } |
174 | } |
175 | |
176 | // As above, but made slightly simpler by requiring that color is opaque. |
177 | static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB, |
178 | const SkAlpha* mask, size_t maskRB, |
179 | SkColor color, int w, int h) { |
180 | SkASSERT(SkColorGetA(color) == 0xFF); |
181 | auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color)); |
182 | auto fn = [&](const Sk4px& d, const Sk4px& aa) { |
183 | // = (s + d(1-sa))aa + d(1-aa) |
184 | // = s*aa + d(1-sa*aa) |
185 | // ~~~> |
186 | // = s*aa + d(1-aa) |
187 | return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv()); |
188 | }; |
189 | while (h --> 0) { |
190 | Sk4px::MapDstAlpha(w, dst, mask, fn); |
191 | dst += dstRB / sizeof(*dst); |
192 | mask += maskRB / sizeof(*mask); |
193 | } |
194 | } |
195 | |
196 | // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case. |
197 | static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB, |
198 | const SkAlpha* mask, size_t maskRB, |
199 | int w, int h) { |
200 | auto fn = [](const Sk4px& d, const Sk4px& aa) { |
201 | // = (s + d(1-sa))aa + d(1-aa) |
202 | // = s*aa + d(1-sa*aa) |
203 | // ~~~> |
204 | // a = 1*aa + d(1-1*aa) = aa + d(1-aa) |
205 | // c = 0*aa + d(1-1*aa) = d(1-aa) |
206 | return Sk4px(Sk16b(aa) & Sk16b(0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255)) |
207 | + d.approxMulDiv255(aa.inv()); |
208 | }; |
209 | while (h --> 0) { |
210 | Sk4px::MapDstAlpha(w, dst, mask, fn); |
211 | dst += dstRB / sizeof(*dst); |
212 | mask += maskRB / sizeof(*mask); |
213 | } |
214 | } |
215 | #endif |
216 | |
217 | /*not static*/ inline void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, |
218 | const SkAlpha* mask, size_t maskRB, |
219 | SkColor color, int w, int h) { |
220 | if (color == SK_ColorBLACK) { |
221 | blit_mask_d32_a8_black(dst, dstRB, mask, maskRB, w, h); |
222 | } else if (SkColorGetA(color) == 0xFF) { |
223 | blit_mask_d32_a8_opaque(dst, dstRB, mask, maskRB, color, w, h); |
224 | } else { |
225 | blit_mask_d32_a8_general(dst, dstRB, mask, maskRB, color, w, h); |
226 | } |
227 | } |
228 | |
229 | } // namespace SK_OPTS_NS |
230 | |
231 | #endif//SkBlitMask_opts_DEFINED |
232 | |