1 | /* |
2 | * Copyright 2011 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #include "include/private/SkColorData.h" |
9 | #include "src/core/SkBlitRow.h" |
10 | #include "src/core/SkOpts.h" |
11 | #include "src/core/SkUtils.h" |
12 | |
13 | // Everyone agrees memcpy() is the best way to do this. |
14 | static void blit_row_s32_opaque(SkPMColor* dst, |
15 | const SkPMColor* src, |
16 | int count, |
17 | U8CPU alpha) { |
18 | SkASSERT(255 == alpha); |
19 | memcpy(dst, src, count * sizeof(SkPMColor)); |
20 | } |
21 | |
22 | // We have SSE2, NEON, and portable implementations of |
23 | // blit_row_s32_blend() and blit_row_s32a_blend(). |
24 | |
25 | // TODO(mtklein): can we do better in NEON than 2 pixels at a time? |
26 | |
27 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
28 | #include <emmintrin.h> |
29 | |
// Lerps four packed 32-bit pixels at once. Every 8-bit channel of the result
// is dst + (((src - dst) * src_scale) >> 8), where the product is kept to
// 16 bits and the final addition wraps at 8 bits.
static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    // Selects the low byte of every 16-bit lane.
    const __m128i lowBytes = _mm_set1_epi32(0x00FF00FF);
    const __m128i scale    = _mm_set1_epi16(src_scale);

    // Splay the sixteen 8-bit channels into two sets of eight 16-bit lanes
    // (low bytes via the mask, high bytes via a shift), take src - dst in
    // each lane and multiply by the scale, keeping the low 16 bits.
    const __m128i scaledLow = _mm_mullo_epi16(
            _mm_sub_epi16(_mm_and_si128(lowBytes, src), _mm_and_si128(lowBytes, dst)),
            scale);
    const __m128i scaledHigh = _mm_mullo_epi16(
            _mm_sub_epi16(_mm_srli_epi16(src, 8), _mm_srli_epi16(dst, 8)),
            scale);

    // Take bits 8..15 of each product (the ">> 8") and interleave them back
    // into byte positions: low-byte lanes shift down, high-byte lanes are
    // already in place and only need their low byte cleared.
    const __m128i delta = _mm_or_si128(_mm_srli_epi16(scaledLow, 8),
                                       _mm_andnot_si128(lowBytes, scaledHigh));

    // Per-byte add; negative differences rely on 8-bit wraparound.
    return _mm_add_epi8(dst, delta);
}
57 | |
58 | |
59 | static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { |
60 | SkASSERT(alpha <= 255); |
61 | |
62 | auto src4 = (const __m128i*)src; |
63 | auto dst4 = ( __m128i*)dst; |
64 | |
65 | while (count >= 4) { |
66 | _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4), |
67 | _mm_loadu_si128(dst4), |
68 | SkAlpha255To256(alpha))); |
69 | src4++; |
70 | dst4++; |
71 | count -= 4; |
72 | } |
73 | |
74 | src = (const SkPMColor*)src4; |
75 | dst = ( SkPMColor*)dst4; |
76 | |
77 | while (count --> 0) { |
78 | *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); |
79 | src++; |
80 | dst++; |
81 | } |
82 | } |
83 | |
84 | static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, |
85 | const __m128i& dst, |
86 | const unsigned aa) { |
87 | unsigned alpha = SkAlpha255To256(aa); |
88 | __m128i src_scale = _mm_set1_epi16(alpha); |
89 | // SkAlphaMulInv256(SkGetPackedA32(src), src_scale) |
90 | __m128i dst_scale = _mm_srli_epi32(src, 24); |
91 | // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale. |
92 | dst_scale = _mm_mullo_epi16(dst_scale, src_scale); |
93 | dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale); |
94 | dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8)); |
95 | dst_scale = _mm_srli_epi32(dst_scale, 8); |
96 | // Duplicate scales into 2x16-bit pattern per pixel. |
97 | dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); |
98 | dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); |
99 | |
100 | const __m128i mask = _mm_set1_epi32(0x00FF00FF); |
101 | |
102 | // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves. |
103 | __m128i src_rb = _mm_and_si128(mask, src); |
104 | __m128i src_ag = _mm_srli_epi16(src, 8); |
105 | __m128i dst_rb = _mm_and_si128(mask, dst); |
106 | __m128i dst_ag = _mm_srli_epi16(dst, 8); |
107 | |
108 | // Scale them. |
109 | src_rb = _mm_mullo_epi16(src_rb, src_scale); |
110 | src_ag = _mm_mullo_epi16(src_ag, src_scale); |
111 | dst_rb = _mm_mullo_epi16(dst_rb, dst_scale); |
112 | dst_ag = _mm_mullo_epi16(dst_ag, dst_scale); |
113 | |
114 | // Add the scaled source and destination. |
115 | dst_rb = _mm_add_epi16(src_rb, dst_rb); |
116 | dst_ag = _mm_add_epi16(src_ag, dst_ag); |
117 | |
118 | // Unsplay the halves back together. |
119 | dst_rb = _mm_srli_epi16(dst_rb, 8); |
120 | dst_ag = _mm_andnot_si128(mask, dst_ag); |
121 | return _mm_or_si128(dst_rb, dst_ag); |
122 | } |
123 | |
124 | static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { |
125 | SkASSERT(alpha <= 255); |
126 | |
127 | auto src4 = (const __m128i*)src; |
128 | auto dst4 = ( __m128i*)dst; |
129 | |
130 | while (count >= 4) { |
131 | _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4), |
132 | _mm_loadu_si128(dst4), |
133 | alpha)); |
134 | src4++; |
135 | dst4++; |
136 | count -= 4; |
137 | } |
138 | |
139 | src = (const SkPMColor*)src4; |
140 | dst = ( SkPMColor*)dst4; |
141 | |
142 | while (count --> 0) { |
143 | *dst = SkBlendARGB32(*src, *dst, alpha); |
144 | src++; |
145 | dst++; |
146 | } |
147 | } |
148 | |
149 | #elif defined(SK_ARM_HAS_NEON) |
150 | #include <arm_neon.h> |
151 | |
// NEON row blend: each channel becomes (src * src_scale + dst * dst_scale) >> 8
// with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
// Pixels are processed two at a time; an odd trailing pixel is handled through
// lane 0 of a zero-filled vector.
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    // Blends eight 8-bit channels (two pixels' worth): widen to 16 bits,
    // weight, add, then narrow back by taking bits 8..15 of each sum.
    auto blendChannels = [src_scale, dst_scale](uint8x8_t s, uint8x8_t d) -> uint8x8_t {
        const uint16x8_t weightedSrc = vmulq_u16(vmovl_u8(s), vdupq_n_u16(src_scale));
        const uint16x8_t weightedDst = vmull_u8(d, vdup_n_u8(dst_scale));
        return vshrn_n_u16(vaddq_u16(weightedDst, weightedSrc), 8);
    };

    // Main loop: two pixels per iteration.
    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t s = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_u32(dst));
        vst1_u32(dst, vreinterpret_u32_u8(blendChannels(s, d)));
    }

    // Leftover pixel: load into lane 0 of a zeroed vector, store lane 0 only.
    if (count == 1) {
        const uint32x2_t zero = vreinterpret_u32_u8(vdup_n_u8(0));
        const uint8x8_t s = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));
        vst1_lane_u32(dst, vreinterpret_u32_u8(blendChannels(s, d)), 0);
    }
}
196 | |
// NEON row blend for sources with per-pixel alpha. Each channel of the result
// is (src * alpha256 + dst * dst_scale) >> 8, where alpha256 is
// SkAlpha255To256(alpha) and dst_scale is computed per pixel from byte 3 of
// the source pixel and alpha256.
//
// Unlike the other variants in this file, this one asserts alpha < 255 (not
// <= 255): the 16-bit approximation in the main loop is only valid when
// alpha256 <= 255.
// NOTE(review): presumably SkAlpha255To256(alpha) is alpha + 1, which is what
// makes alpha < 255 sufficient - confirm against its definition.
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel off one pixel when count is odd so the main loop below can always
    // consume pixels in pairs.
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load the single pixel into lane 0 of zero-filled vectors.
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Byte 3 of the source pixel drives the destination scale; here the
        // exact scalar SkAlphaMulInv256() is used rather than the vector
        // approximation in the main loop.
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Widen to 16 bits and weight the source by alpha256.
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Widen to 16 bits and weight the destination by dst_scale.
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Sum and narrow back to 8 bits by taking bits 8..15.
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Only lane 0 (the one real pixel) is written back.
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    // Table-lookup indices for vtbl1_u8(): replicate byte 3 across the four
    // lanes of the first pixel and byte 7 across the four lanes of the second.
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    // Two pixels per iteration. The loop only terminates when count reaches
    // exactly 0, so it relies on count being even and non-negative here.
    while (count) {

        uint8x8_t vsrc, vdst, vres, vsrc_alphas;
        uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

        // Prefetch 32 pixels ahead of the current position in both rows.
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_scale = vdupq_n_u16(alpha256);

        // Broadcast each pixel's byte 3 to all of that pixel's lanes and
        // widen to 16 bits.
        vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
        vdst_scale = vmovl_u8(vsrc_alphas);
        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        //   step 1: x = 0xFF00 - a * alpha256      (multiply-subtract)
        //   step 2: x = x + (x >> 8)               (shift-right-accumulate)
        //   step 3: scale = 1 + (x >> 8)
        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

        // Widen and weight the source by alpha256.
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide *= vsrc_scale;

        // Widen and weight the destination by the per-pixel scale.
        vdst_wide = vmovl_u8(vdst);
        vdst_wide *= vdst_scale;

        // Sum and narrow back to 8 bits by taking bits 8..15.
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
272 | |
273 | #else |
274 | static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { |
275 | SkASSERT(alpha <= 255); |
276 | while (count --> 0) { |
277 | *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); |
278 | src++; |
279 | dst++; |
280 | } |
281 | } |
282 | |
283 | static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { |
284 | SkASSERT(alpha <= 255); |
285 | while (count --> 0) { |
286 | *dst = SkBlendARGB32(*src, *dst, alpha); |
287 | src++; |
288 | dst++; |
289 | } |
290 | } |
291 | #endif |
292 | |
293 | SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) { |
294 | static const SkBlitRow::Proc32 kProcs[] = { |
295 | blit_row_s32_opaque, |
296 | blit_row_s32_blend, |
297 | nullptr, // blit_row_s32a_opaque is in SkOpts |
298 | blit_row_s32a_blend |
299 | }; |
300 | |
301 | SkASSERT(flags < SK_ARRAY_COUNT(kProcs)); |
302 | flags &= SK_ARRAY_COUNT(kProcs) - 1; // just to be safe |
303 | |
304 | return flags == 2 ? SkOpts::blit_row_s32a_opaque |
305 | : kProcs[flags]; |
306 | } |
307 | |
308 | void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) { |
309 | switch (SkGetPackedA32(color)) { |
310 | case 0: memmove(dst, src, count * sizeof(SkPMColor)); return; |
311 | case 255: sk_memset32(dst, color, count); return; |
312 | } |
313 | return SkOpts::blit_row_color32(dst, src, count, color); |
314 | } |
315 | |