/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/private/SkColorData.h"
#include "src/core/SkBlitRow.h"
#include "src/core/SkOpts.h"
#include "src/core/SkUtils.h"
|---|
// Everyone agrees memcpy() is the best way to do this.
|---|
| 14 | static void blit_row_s32_opaque(SkPMColor* dst, | 
|---|
| 15 | const SkPMColor* src, | 
|---|
| 16 | int count, | 
|---|
| 17 | U8CPU alpha) { | 
|---|
| 18 | SkASSERT(255 == alpha); | 
|---|
| 19 | memcpy(dst, src, count * sizeof(SkPMColor)); | 
|---|
| 20 | } | 
|---|
| 21 |  | 
|---|
// We have SSE2, NEON, and portable implementations of
// blit_row_s32_blend() and blit_row_s32a_blend().

// TODO(mtklein): can we do better in NEON than 2 pixels at a time?
|---|
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>
|---|
| 29 |  | 
|---|
// Lerp four packed 32-bit pixels toward src at once:
// per 8-bit channel, returns dst + (((src - dst) * src_scale) >> 8).
// The work is done on the pixels splayed into two 16-bit-lane halves
// (R/B in one register, A/G in the other) so each channel gets a full
// 16-bit multiply.
// NOTE(review): the caller below passes SkAlpha255To256(alpha), so
// src_scale is expected to be a 0..256 scale — confirm before reusing.
static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    // Computes dst + (((src - dst)*src_scale)>>8)
    const __m128i mask = _mm_set1_epi32(0x00FF00FF);

    // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
    __m128i src_rb = _mm_and_si128(mask, src);
    __m128i src_ag = _mm_srli_epi16(src, 8);
    __m128i dst_rb = _mm_and_si128(mask, dst);
    __m128i dst_ag = _mm_srli_epi16(dst, 8);

    // Compute scaled differences.  (src - dst) may be "negative" here;
    // the arithmetic is modular and works out after the shifts below.
    __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
    __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
    __m128i s = _mm_set1_epi16(src_scale);
    diff_rb = _mm_mullo_epi16(diff_rb, s);
    diff_ag = _mm_mullo_epi16(diff_ag, s);

    // Pack the differences back together.
    diff_rb = _mm_srli_epi16(diff_rb, 8);
    diff_ag = _mm_andnot_si128(mask, diff_ag);
    __m128i diff = _mm_or_si128(diff_rb, diff_ag);

    // Add difference to destination (per-byte add).
    return _mm_add_epi8(dst, diff);
}
|---|
| 57 |  | 
|---|
| 58 |  | 
|---|
| 59 | static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { | 
|---|
| 60 | SkASSERT(alpha <= 255); | 
|---|
| 61 |  | 
|---|
| 62 | auto src4 = (const __m128i*)src; | 
|---|
| 63 | auto dst4 = (      __m128i*)dst; | 
|---|
| 64 |  | 
|---|
| 65 | while (count >= 4) { | 
|---|
| 66 | _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4), | 
|---|
| 67 | _mm_loadu_si128(dst4), | 
|---|
| 68 | SkAlpha255To256(alpha))); | 
|---|
| 69 | src4++; | 
|---|
| 70 | dst4++; | 
|---|
| 71 | count -= 4; | 
|---|
| 72 | } | 
|---|
| 73 |  | 
|---|
| 74 | src = (const SkPMColor*)src4; | 
|---|
| 75 | dst = (      SkPMColor*)dst4; | 
|---|
| 76 |  | 
|---|
| 77 | while (count --> 0) { | 
|---|
| 78 | *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); | 
|---|
| 79 | src++; | 
|---|
| 80 | dst++; | 
|---|
| 81 | } | 
|---|
| 82 | } | 
|---|
| 83 |  | 
|---|
// Blend four packed pixels honoring each source pixel's own alpha plus a
// global coverage value:
//   dst = (src * src_scale + dst * SkAlphaMulInv256(srcA, src_scale)) >> 8
// per 8-bit channel, where src_scale = SkAlpha255To256(aa).
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
                                         const __m128i& dst,
                                         const unsigned aa) {
    unsigned alpha = SkAlpha255To256(aa);
    __m128i src_scale = _mm_set1_epi16(alpha);
    // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
    __m128i dst_scale = _mm_srli_epi32(src, 24);  // isolate each pixel's alpha byte
    // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
    // 0xFFFF - a*s, then the usual +(x>>8) >>8 rounding to divide by 255-ish.
    dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
    dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
    dst_scale = _mm_srli_epi32(dst_scale, 8);
    // Duplicate scales into 2x16-bit pattern per pixel.
    dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
    dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

    const __m128i mask = _mm_set1_epi32(0x00FF00FF);

    // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
    __m128i src_rb = _mm_and_si128(mask, src);
    __m128i src_ag = _mm_srli_epi16(src, 8);
    __m128i dst_rb = _mm_and_si128(mask, dst);
    __m128i dst_ag = _mm_srli_epi16(dst, 8);

    // Scale them.
    src_rb = _mm_mullo_epi16(src_rb, src_scale);
    src_ag = _mm_mullo_epi16(src_ag, src_scale);
    dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
    dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

    // Add the scaled source and destination.
    dst_rb = _mm_add_epi16(src_rb, dst_rb);
    dst_ag = _mm_add_epi16(src_ag, dst_ag);

    // Unsplay the halves back together.
    dst_rb = _mm_srli_epi16(dst_rb, 8);
    dst_ag = _mm_andnot_si128(mask, dst_ag);
    return _mm_or_si128(dst_rb, dst_ag);
}
|---|
| 123 |  | 
|---|
| 124 | static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { | 
|---|
| 125 | SkASSERT(alpha <= 255); | 
|---|
| 126 |  | 
|---|
| 127 | auto src4 = (const __m128i*)src; | 
|---|
| 128 | auto dst4 = (      __m128i*)dst; | 
|---|
| 129 |  | 
|---|
| 130 | while (count >= 4) { | 
|---|
| 131 | _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4), | 
|---|
| 132 | _mm_loadu_si128(dst4), | 
|---|
| 133 | alpha)); | 
|---|
| 134 | src4++; | 
|---|
| 135 | dst4++; | 
|---|
| 136 | count -= 4; | 
|---|
| 137 | } | 
|---|
| 138 |  | 
|---|
| 139 | src = (const SkPMColor*)src4; | 
|---|
| 140 | dst = (      SkPMColor*)dst4; | 
|---|
| 141 |  | 
|---|
| 142 | while (count --> 0) { | 
|---|
| 143 | *dst = SkBlendARGB32(*src, *dst, alpha); | 
|---|
| 144 | src++; | 
|---|
| 145 | dst++; | 
|---|
| 146 | } | 
|---|
| 147 | } | 
|---|
| 148 |  | 
|---|
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
|---|
| 151 |  | 
|---|
// Blend a row of pixels with a constant global alpha, NEON version:
// per channel, dst = (src * src_scale + dst * dst_scale) >> 8, two pixels
// per iteration with a lane-wise tail for an odd count.
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load two pixels from each of src and dst.
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // NOTE(review): src is widened first and multiplied in 16 bits,
        // while dst below uses an 8-bit widening multiply — presumably
        // because src_scale can exceed a u8 lane (confirm the range of
        // SkAlpha255To256).
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        // Same math for a single trailing pixel, using lane loads/stores
        // so we never touch memory past the end of the row.
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
|---|
| 196 |  | 
|---|
// Blend a row honoring both each pixel's own alpha and a global alpha,
// NEON version.  Note the STRICT assert (alpha < 255, not <= 255): the
// 0xFF00 approximation in the vector loop is only valid for
// alpha256 <= 255 (see the comment below); alpha == 255 is expected to
// take a different (opaque) path.
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel one pixel up front so the main loop always sees an even count.
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Byte 3 of the loaded pixel is its alpha (same byte the table
        // lookup below extracts in the vector loop).
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    // Table that broadcasts each pixel's alpha byte (indices 3 and 7)
    // across that pixel's four channel lanes.
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    // Main loop: two pixels per iteration (count is even here).
    while (count) {

        uint8x8_t vsrc, vdst, vres, vsrc_alphas;
        uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_scale = vdupq_n_u16(alpha256);

        vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
        vdst_scale = vmovl_u8(vsrc_alphas);
        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide *= vsrc_scale;

        vdst_wide = vmovl_u8(vdst);
        vdst_wide *= vdst_scale;

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
|---|
| 272 |  | 
|---|
#else
|---|
| 274 | static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { | 
|---|
| 275 | SkASSERT(alpha <= 255); | 
|---|
| 276 | while (count --> 0) { | 
|---|
| 277 | *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); | 
|---|
| 278 | src++; | 
|---|
| 279 | dst++; | 
|---|
| 280 | } | 
|---|
| 281 | } | 
|---|
| 282 |  | 
|---|
| 283 | static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { | 
|---|
| 284 | SkASSERT(alpha <= 255); | 
|---|
| 285 | while (count --> 0) { | 
|---|
| 286 | *dst = SkBlendARGB32(*src, *dst, alpha); | 
|---|
| 287 | src++; | 
|---|
| 288 | dst++; | 
|---|
| 289 | } | 
|---|
| 290 | } | 
|---|
#endif
|---|
| 292 |  | 
|---|
| 293 | SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) { | 
|---|
| 294 | static const SkBlitRow::Proc32 kProcs[] = { | 
|---|
| 295 | blit_row_s32_opaque, | 
|---|
| 296 | blit_row_s32_blend, | 
|---|
| 297 | nullptr,  // blit_row_s32a_opaque is in SkOpts | 
|---|
| 298 | blit_row_s32a_blend | 
|---|
| 299 | }; | 
|---|
| 300 |  | 
|---|
| 301 | SkASSERT(flags < SK_ARRAY_COUNT(kProcs)); | 
|---|
| 302 | flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe | 
|---|
| 303 |  | 
|---|
| 304 | return flags == 2 ? SkOpts::blit_row_s32a_opaque | 
|---|
| 305 | : kProcs[flags]; | 
|---|
| 306 | } | 
|---|
| 307 |  | 
|---|
| 308 | void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) { | 
|---|
| 309 | switch (SkGetPackedA32(color)) { | 
|---|
| 310 | case   0: memmove(dst, src, count * sizeof(SkPMColor)); return; | 
|---|
| 311 | case 255: sk_memset32(dst, color, count);               return; | 
|---|
| 312 | } | 
|---|
| 313 | return SkOpts::blit_row_color32(dst, src, count, color); | 
|---|
| 314 | } | 
|---|
| 315 |  | 
|---|