/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/private/SkColorData.h"
#include "src/core/SkBlitRow.h"
#include "src/core/SkOpts.h"
#include "src/core/SkUtils.h"

// Everyone agrees memcpy() is the best way to do this.
static void blit_row_s32_opaque(SkPMColor* dst,
                                const SkPMColor* src,
                                int count,
                                U8CPU alpha) {
    SkASSERT(255 == alpha);
    memcpy(dst, src, count * sizeof(SkPMColor));
}

// We have SSE2, NEON, and portable implementations of
// blit_row_s32_blend() and blit_row_s32a_blend().

// TODO(mtklein): can we do better in NEON than 2 pixels at a time?

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>

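    // Both SSE2 helpers below work on four pixels per __m128i.  They split the
    // packed 8-bit channels into two 16-bit "splayed" halves (the r/b bytes in
    // one register, the a/g bytes in the other) so per-channel multiplies can
    // be done in 16-bit lanes.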
    static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                        const __m128i& dst,
                                        const unsigned src_scale) {
        // Computes dst + (((src - dst)*src_scale)>>8)
        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Compute scaled differences.
        __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
        __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
        __m128i s = _mm_set1_epi16(src_scale);
        diff_rb = _mm_mullo_epi16(diff_rb, s);
        diff_ag = _mm_mullo_epi16(diff_ag, s);

        // Pack the differences back together.
        diff_rb = _mm_srli_epi16(diff_rb, 8);
        diff_ag = _mm_andnot_si128(mask, diff_ag);
        __m128i diff = _mm_or_si128(diff_rb, diff_ag);

        // Add difference to destination.
        return _mm_add_epi8(dst, diff);
    }

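    // Blend count pixels at a single global alpha: four at a time with
    // unaligned SIMD loads/stores, then a scalar SkPMLerp() tail for the
    // remaining 0-3 pixels.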
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (      __m128i*)dst;

        while (count >= 4) {
            _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
                                                 _mm_loadu_si128(dst4),
                                                 SkAlpha255To256(alpha)));
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (      SkPMColor*)dst4;

        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

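    // Per-pixel blend with an additional global alpha, the vector analog of
    // scalar SkBlendARGB32(src, dst, aa).  Per channel this computes roughly
    //     result = (src*src_scale + dst*dst_scale) >> 8,
    // with src_scale = SkAlpha255To256(aa)
    // and  dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), src_scale).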
    static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
                                             const __m128i& dst,
                                             const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m128i src_scale = _mm_set1_epi16(alpha);
        // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
        __m128i dst_scale = _mm_srli_epi32(src, 24);
        // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
        dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
        dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
        dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
        dst_scale = _mm_srli_epi32(dst_scale, 8);
        // Duplicate scales into 2x16-bit pattern per pixel.
        dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
        dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Scale them.
        src_rb = _mm_mullo_epi16(src_rb, src_scale);
        src_ag = _mm_mullo_epi16(src_ag, src_scale);
        dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
        dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = _mm_add_epi16(src_rb, dst_rb);
        dst_ag = _mm_add_epi16(src_ag, dst_ag);

        // Unsplay the halves back together.
        dst_rb = _mm_srli_epi16(dst_rb, 8);
        dst_ag = _mm_andnot_si128(mask, dst_ag);
        return _mm_or_si128(dst_rb, dst_ag);
    }

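    // Same loop structure as blit_row_s32_blend() above: four pixels per
    // iteration, then a scalar SkBlendARGB32() tail.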
    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (      __m128i*)dst;

        while (count >= 4) {
            _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
                                                      _mm_loadu_si128(dst4),
                                                      alpha));
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (      SkPMColor*)dst4;

        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }

#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

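    // The NEON paths work on two pixels at a time: one uint8x8_t holds exactly
    // two packed SkPMColors.  Note src_scale may be 256, which does not fit in
    // a uint8 lane, so the source is widened and multiplied at 16 bits, while
    // dst_scale (256 - src_scale, at most 255) can use the 8x8->16 vmull_u8.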
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        while (count >= 2) {
            uint8x8_t vsrc, vdst, vres;
            uint16x8_t vsrc_wide, vdst_wide;

            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }

        if (count == 1) {
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vsrc_wide, vdst_wide;

            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        }
    }

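    // Per-pixel blend with a global alpha < 255.  Any odd pixel is handled
    // first so the main loop always consumes an even count, two per iteration.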
    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha < 255);

        unsigned alpha256 = SkAlpha255To256(alpha);

        if (count & 1) {
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vdst_wide, vsrc_wide;
            unsigned dst_scale;

            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            dst_scale = vget_lane_u8(vsrc, 3);
            dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

            vdst_wide = vmovl_u8(vdst);
            vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
            dst++;
            src++;
            count--;
        }

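        // vtbl1_u8 with indices {3,3,3,3, 7,7,7,7} broadcasts each pixel's
        // alpha (byte 3 for pixel 0, byte 7 for pixel 1) across that pixel's
        // four lanes.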
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        while (count) {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            vsrc_scale = vdupq_n_u16(alpha256);

            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }
    }

#else
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }
#endif

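// The two low bits of flags index kProcs: bit 0 means a global alpha is being
// applied, bit 1 means the source has per-pixel alpha.  Index 2 (per-pixel
// alpha only) is resolved through SkOpts so it can use runtime-detected CPU
// specializations.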
SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
    static const SkBlitRow::Proc32 kProcs[] = {
        blit_row_s32_opaque,
        blit_row_s32_blend,
        nullptr,  // blit_row_s32a_opaque is in SkOpts
        blit_row_s32a_blend
    };

    SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
    flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe

    return flags == 2 ? SkOpts::blit_row_s32a_opaque
                      : kProcs[flags];
}

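// Fast paths first: a fully transparent color is a no-op blend (just copy src
// to dst), and a fully opaque color is a solid fill; everything else goes
// through SkOpts::blit_row_color32.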
void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
    switch (SkGetPackedA32(color)) {
        case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
        case 255: sk_memset32(dst, color, count); return;
    }
    return SkOpts::blit_row_color32(dst, src, count, color);
}