| 1 | /* |
| 2 | * Copyright 2017 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "src/core/SkMaskBlurFilter.h" |
| 9 | |
| 10 | #include "include/core/SkColorPriv.h" |
| 11 | #include "include/private/SkMalloc.h" |
| 12 | #include "include/private/SkNx.h" |
| 13 | #include "include/private/SkTemplates.h" |
| 14 | #include "include/private/SkTo.h" |
| 15 | #include "src/core/SkArenaAlloc.h" |
| 16 | #include "src/core/SkGaussFilter.h" |
| 17 | |
| 18 | #include <cmath> |
| 19 | #include <climits> |
| 20 | |
| 21 | namespace { |
// Pi at double precision; used below to derive the box-filter window from sigma.
static const double kPi = 3.14159265358979323846264338327950288;
| 23 | |
// Plans and executes a Gaussian blur approximated by three successive box blurs
// (the SVG/CSS approach). The constructor converts sigma into box-filter sizes,
// the border the blur adds around the mask, and a 32.32 fixed-point reciprocal
// used to normalize the triple running sum.
class PlanGauss final {
public:
    explicit PlanGauss(double sigma) {
        // Box window that best approximates a Gaussian of this sigma; always
        // at least one pixel wide.
        auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
        auto window = std::max(1, possibleWindow);

        // Sizes of the three ring buffers. For an even window the spec uses
        // two even filters and a final filter one wider, hence the third size.
        fPass0Size = window - 1;
        fPass1Size = window - 1;
        fPass2Size = (window & 1) == 1 ? window - 1 : window;

        // Calculating the border is tricky. I will go through the odd case which is simpler, and
        // then through the even case. Given a stack of filters seven wide for the odd case of
        // three passes.
        //
        //        S
        //     aaaAaaa
        //     bbbBbbb
        //     cccCccc
        //        D
        //
        // The furthest changed pixel is when the filters are in the following configuration.
        //
        //              S
        //        aaaAaaa
        //     bbbBbbb
        //  cccCccc
        //     D
        //
        // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
        // finally D is C. So, with a window size of seven the border is nine. In general, the
        // border is 3*((window - 1)/2).
        //
        // For even cases the filter stack is more complicated. The spec specifies two passes
        // of even filters and a final pass of odd filters. A stack for a width of six looks like
        // this.
        //
        //       S
        //    aaaAaa
        //     bbBbbb
        //    cccCccc
        //       D
        //
        // The furthest pixel looks like this.
        //
        //            S
        //       aaaAaa
        //      bbBbbb
        //  cccCccc
        //     D
        //
        // For a window of size six, the border value is seven. In general the border is
        // 3 * (window/2) - 1.
        fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
        fSlidingWindow = 2 * fBorder + 1;

        // If the window is odd then the divisor is just window ^ 3 otherwise,
        // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
        auto window2 = window * window;
        auto window3 = window2 * window;
        auto divisor = (window & 1) == 1 ? window3 : window3 + window2;

        // Store the reciprocal of the divisor as a 32.32 fixed-point value so
        // normalization is a multiply and a shift instead of a divide per pixel.
        fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
    }

    // Scratch space (in uint32_t elements) needed by makeBlurScan's ring buffers.
    size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }

    // Number of pixels the blur extends past the source on each side.
    int border() const { return fBorder; }

public:  // NOTE(review): redundant access specifier; everything above is already public.
    // One-dimensional triple-box blur over a single row or column, implemented
    // as three chained running (box) sums backed by ring buffers.
    class Scan {
    public:
        Scan(uint64_t weight, int noChangeCount,
             uint32_t* buffer0, uint32_t* buffer0End,
             uint32_t* buffer1, uint32_t* buffer1End,
             uint32_t* buffer2, uint32_t* buffer2End)
            : fWeight{weight}
            , fNoChangeCount{noChangeCount}
            , fBuffer0{buffer0}
            , fBuffer0End{buffer0End}
            , fBuffer1{buffer1}
            , fBuffer1End{buffer1End}
            , fBuffer2{buffer2}
            , fBuffer2End{buffer2End}
        { }

        // Blur the alpha values [srcBegin, srcEnd) into dst, stepping by
        // dstStride up to dstEnd. The destination is longer than the source
        // (by 2 * border); the first section is produced left-to-right while
        // consuming the source, the middle while the leading edge is past the
        // source, and the final section is produced right-to-left.
        template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
                                                uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
            auto buffer0Cursor = fBuffer0;
            auto buffer1Cursor = fBuffer1;
            auto buffer2Cursor = fBuffer2;

            // The three ring buffers are laid out contiguously; clear them all at once.
            std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));

            // Running box sums for the three chained passes.
            uint32_t sum0 = 0;
            uint32_t sum1 = 0;
            uint32_t sum2 = 0;

            // Consume the source generating pixels.
            for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
                uint32_t leadingEdge = *src;
                sum0 += leadingEdge;
                sum1 += sum0;
                sum2 += sum1;

                *dst = this->finalScale(sum2);

                // Retire each ring buffer's trailing element from its sum and
                // record the new running value in its place.
                sum2 -= *buffer2Cursor;
                *buffer2Cursor = sum1;
                buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;

                sum1 -= *buffer1Cursor;
                *buffer1Cursor = sum0;
                buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;

                sum0 -= *buffer0Cursor;
                *buffer0Cursor = leadingEdge;
                buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
            }

            // The leading edge is off the right side of the mask.
            for (int i = 0; i < fNoChangeCount; i++) {
                uint32_t leadingEdge = 0;
                sum0 += leadingEdge;
                sum1 += sum0;
                sum2 += sum1;

                *dst = this->finalScale(sum2);

                sum2 -= *buffer2Cursor;
                *buffer2Cursor = sum1;
                buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;

                sum1 -= *buffer1Cursor;
                *buffer1Cursor = sum0;
                buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;

                sum0 -= *buffer0Cursor;
                *buffer0Cursor = leadingEdge;
                buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;

                dst += dstStride;
            }

            // Starting from the right, fill in the rest of the buffer.
            std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));

            sum0 = sum1 = sum2 = 0;

            uint8_t* dstCursor = dstEnd;
            AlphaIter src = srcEnd;
            while (dstCursor > dst) {
                dstCursor -= dstStride;
                uint32_t leadingEdge = *(--src);
                sum0 += leadingEdge;
                sum1 += sum0;
                sum2 += sum1;

                *dstCursor = this->finalScale(sum2);

                sum2 -= *buffer2Cursor;
                *buffer2Cursor = sum1;
                buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;

                sum1 -= *buffer1Cursor;
                *buffer1Cursor = sum0;
                buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;

                sum0 -= *buffer0Cursor;
                *buffer0Cursor = leadingEdge;
                buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
            }
        }

    private:
        // One half in 32.32 fixed point: the round-to-nearest term for finalScale.
        static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;

        // Normalize a triple running sum back to an alpha byte by multiplying
        // with the 32.32 fixed-point reciprocal and rounding.
        uint8_t finalScale(uint32_t sum) const {
            return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
        }

        uint64_t fWeight;       // 32.32 fixed-point reciprocal of the filter divisor.
        int fNoChangeCount;     // Destination pixels produced after the source is exhausted.
        uint32_t* fBuffer0;     // Ring buffer for pass 0 ...
        uint32_t* fBuffer0End;
        uint32_t* fBuffer1;     // ... pass 1 ...
        uint32_t* fBuffer1End;
        uint32_t* fBuffer2;     // ... and pass 2. All three are contiguous in memory.
        uint32_t* fBuffer2End;
    };

    // Carve caller-provided scratch (at least bufferSize() elements) into the
    // three ring buffers and produce a Scan ready to blur spans of `width`.
    Scan makeBlurScan(int width, uint32_t* buffer) const {
        uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
        buffer0 = buffer;
        buffer0End = buffer1 = buffer0 + fPass0Size;
        buffer1End = buffer2 = buffer1 + fPass1Size;
        buffer2End = buffer2 + fPass2Size;
        // When the sliding window is wider than the source, this many output
        // pixels remain to be generated after the source runs out.
        int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;

        return Scan(
            fWeight, noChangeCount,
            buffer0, buffer0End,
            buffer1, buffer1End,
            buffer2, buffer2End);
    }

    uint64_t fWeight;       // 32.32 fixed-point reciprocal of the filter divisor.
    int fBorder;            // Pixels the blur extends past the source on each side.
    int fSlidingWindow;     // 2 * fBorder + 1.
    int fPass0Size;         // Ring-buffer sizes for the three passes.
    int fPass1Size;
    int fPass2Size;
};
| 236 | |
| 237 | } // namespace |
| 238 | |
| 239 | // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow |
| 240 | // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The |
| 241 | // additional + 1 added to window represents adding one more leading element before subtracting the |
| 242 | // trailing element. |
| 243 | // Explanation of maximums: |
| 244 | // sum0 = (window + 1) * 255 |
| 245 | // sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255 |
| 246 | // sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255 |
| 247 | // |
| 248 | // The value (window + 1)^3 * 255 must fit in a uint32_t. So, |
| 249 | // (window + 1)^3 * 255 < 2^32. window = 255. |
| 250 | // |
| 251 | // window = floor(sigma * 3 * sqrt(2 * kPi) / 4) |
| 252 | // For window <= 255, the largest value for sigma is 135. |
// Construct a blur filter for the given sigmas. Sigmas are pinned to
// [0, 135]; see the overflow analysis above for why 135 is the maximum.
SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
    : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)}
    , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)}
{
    // Negative sigmas are caller bugs; the pin above would silently hide them.
    SkASSERT(sigmaW >= 0);
    SkASSERT(sigmaH >= 0);
}
| 260 | |
| 261 | bool SkMaskBlurFilter::hasNoBlur() const { |
| 262 | return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1); |
| 263 | } |
| 264 | |
| 265 | // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first. |
| 266 | // Each of these converts width (up to 8) mask values to A8. |
| 267 | static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
| 268 | SkASSERT(0 < width && width <= 8); |
| 269 | |
| 270 | uint8_t masks = *from; |
| 271 | for (int i = 0; i < width; ++i) { |
| 272 | a8[i] = (masks >> (7 - i)) & 1 ? 0xFF |
| 273 | : 0x00; |
| 274 | } |
| 275 | } |
| 276 | static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
| 277 | SkASSERT(0 < width && width <= 8); |
| 278 | |
| 279 | for (int i = 0; i < width; ++i) { |
| 280 | unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i], |
| 281 | r = SkPacked16ToR32(rgb), |
| 282 | g = SkPacked16ToG32(rgb), |
| 283 | b = SkPacked16ToB32(rgb); |
| 284 | a8[i] = (r + g + b) / 3; |
| 285 | } |
| 286 | } |
| 287 | static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
| 288 | SkASSERT(0 < width && width <= 8); |
| 289 | for (int i = 0; i < width; ++i) { |
| 290 | uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i]; |
| 291 | a8[i] = SkGetPackedA32(rgba); |
| 292 | } |
| 293 | } |
| 294 | using ToA8 = decltype(bw_to_a8); |
| 295 | |
| 296 | static Sk8h load(const uint8_t* from, int width, ToA8* toA8) { |
| 297 | // Our fast path is a full 8-byte load of A8. |
| 298 | // So we'll conditionally handle the two slow paths using tmp: |
| 299 | // - if we have a function to convert another mask to A8, use it; |
| 300 | // - if not but we have less than 8 bytes to load, load them one at a time. |
| 301 | uint8_t tmp[8] = {0,0,0,0, 0,0,0,0}; |
| 302 | if (toA8) { |
| 303 | toA8(tmp, from, width); |
| 304 | from = tmp; |
| 305 | } else if (width < 8) { |
| 306 | for (int i = 0; i < width; ++i) { |
| 307 | tmp[i] = from[i]; |
| 308 | } |
| 309 | from = tmp; |
| 310 | } |
| 311 | |
| 312 | // Load A8 and convert to 8.8 fixed-point. |
| 313 | return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8; |
| 314 | } |
| 315 | |
| 316 | static void store(uint8_t* to, const Sk8h& v, int width) { |
| 317 | Sk8b b = SkNx_cast<uint8_t>(v >> 8); |
| 318 | if (width == 8) { |
| 319 | b.store(to); |
| 320 | } else { |
| 321 | uint8_t buffer[8]; |
| 322 | b.store(buffer); |
| 323 | for (int i = 0; i < width; i++) { |
| 324 | to[i] = buffer[i]; |
| 325 | } |
| 326 | } |
| 327 | }; |
| 328 | |
// Zero, named so the shuffle tables in blur_x_radius_N below line up visually.
static constexpr uint16_t _____ = 0u;
// One half in 8.8 fixed point: the round-to-nearest term carried in the accumulators.
static constexpr uint16_t kHalf = 0x80u;
| 331 | |
| 332 | // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded |
| 333 | // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8 |
| 334 | // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits |
| 335 | // resulting in a 8.8 format. |
| 336 | // |
// The blur_x_radius_N functions below blur along a row of pixels using a kernel with radius N.
// This system is set up to minimize the number of multiplies needed.
| 339 | // |
| 340 | // Explanation: |
| 341 | // Blurring a specific mask value is given by the following equation where D_n is the resulting |
| 342 | // mask value and S_n is the source value. The example below is for a filter with a radius of 1 |
| 343 | // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are |
| 344 | // aligned. The filter is given by G_n where n is the symmetric filter value. |
| 345 | // |
| 346 | // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1]. |
| 347 | // |
| 348 | // We can start the source index at an offset relative to the destination separated by the |
| 349 | // radius. This results in a non-traditional restating of the above filter. |
| 350 | // |
| 351 | // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1] |
| 352 | // |
| 353 | // If we look at three specific consecutive destinations the following equations result: |
| 354 | // |
//  D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
//  D[6] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
//  D[7] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
| 358 | // |
| 359 | // In the above equations, notice that S[7] is used in all three. In particular, two values are |
| 360 | // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and |
| 361 | // D[7]. |
| 362 | // |
| 363 | // From the point of view of a source value we end up with the following three equations. |
| 364 | // |
| 365 | // Given S[7]: |
| 366 | // D[5] += S[7]*G[1] |
| 367 | // D[6] += S[7]*G[0] |
| 368 | // D[7] += S[7]*G[1] |
| 369 | // |
| 370 | // In General: |
| 371 | // D[n] += S[n]*G[1] |
| 372 | // D[n+1] += S[n]*G[0] |
| 373 | // D[n+2] += S[n]*G[1] |
| 374 | // |
| 375 | // Now these equations can be ganged using SIMD to form: |
| 376 | // D[n..n+7] += S[n..n+7]*G[1] |
| 377 | // D[n+1..n+8] += S[n..n+7]*G[0] |
| 378 | // D[n+2..n+9] += S[n..n+7]*G[1] |
| 379 | // The next set of values becomes. |
| 380 | // D[n+8..n+15] += S[n+8..n+15]*G[1] |
| 381 | // D[n+9..n+16] += S[n+8..n+15]*G[0] |
| 382 | // D[n+10..n+17] += S[n+8..n+15]*G[1] |
| 383 | // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both |
| 384 | // S[n..7] and S[n+8..n+15]. |
| 385 | // |
| 386 | // Just one more transformation allows the code to maintain all working values in |
| 387 | // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is |
| 388 | // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}. |
| 389 | // |
| 390 | // D[n..n+7] += S[n..n+7] * G[1] |
| 391 | // D[n..n+8] += {0, S[n..n+7] * G[0]} |
| 392 | // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} |
| 393 | // |
| 394 | // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a |
| 395 | // register d8. In addition, S[0..n+7] becomes s0. |
| 396 | // |
| 397 | // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below. |
| 398 | // |
| 399 | // Sk8h v0 = s0*G[0] |
| 400 | // Sk8h v1 = s0*G[1] |
| 401 | // /* D[n..n+7] += S[n..n+7] * G[1] */ |
| 402 | // d0 += v1; |
| 403 | // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */ |
| 404 | // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]} |
| 405 | // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____} |
| 406 | // /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */ |
| 407 | // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]} |
| 408 | // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} |
| 409 | // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. |
| 410 | |
| 411 | static void blur_x_radius_1( |
| 412 | const Sk8h& s0, |
| 413 | const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, |
| 414 | Sk8h* d0, Sk8h* d8) { |
| 415 | |
| 416 | auto v1 = s0.mulHi(g1); |
| 417 | auto v0 = s0.mulHi(g0); |
| 418 | |
| 419 | // D[n..n+7] += S[n..n+7] * G[1] |
| 420 | *d0 += v1; |
| 421 | |
| 422 | //D[n..n+8] += {0, S[n..n+7] * G[0]} |
| 423 | *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; |
| 424 | *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____}; |
| 425 | |
| 426 | // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} |
| 427 | *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; |
| 428 | *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; |
| 429 | |
| 430 | } |
| 431 | |
// Blur 8 horizontally-consecutive source values with a radius-2 (width 5)
// kernel, accumulating shifted products into d0 and the spill register d8.
static void blur_x_radius_2(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
        Sk8h* d0, Sk8h* d8) {
    auto v0 = s0.mulHi(g0);
    auto v1 = s0.mulHi(g1);
    auto v2 = s0.mulHi(g2);

    // D[n..n+7] += S[n..n+7] * G[2]
    *d0 += v2;

    // D[n..n+8] += {0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
    *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};

    // D[n..n+9] += {0, 0, S[n..n+7] * G[0]}
    *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
    *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};

    // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};

    // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]}
    *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
    *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
}
| 459 | |
// Blur 8 horizontally-consecutive source values with a radius-3 (width 7)
// kernel, accumulating shifted products into d0 and the spill register d8.
static void blur_x_radius_3(
        const Sk8h& s0,
        const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
        Sk8h* d0, Sk8h* d8) {
    auto v0 = s0.mulHi(gauss0);
    auto v1 = s0.mulHi(gauss1);
    auto v2 = s0.mulHi(gauss2);
    auto v3 = s0.mulHi(gauss3);

    // D[n..n+7] += S[n..n+7] * G[3]
    *d0 += v3;

    // D[n..n+8] += {0, S[n..n+7] * G[2]}
    *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
    *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};

    // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
    *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};

    // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]}
    *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
    *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};

    // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
    *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};

    // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
    *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};

    // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
    *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
}
| 496 | |
// Blur 8 horizontally-consecutive source values with a radius-4 (width 9)
// kernel, accumulating shifted products into d0 and the spill register d8.
static void blur_x_radius_4(
        const Sk8h& s0,
        const Sk8h& gauss0,
        const Sk8h& gauss1,
        const Sk8h& gauss2,
        const Sk8h& gauss3,
        const Sk8h& gauss4,
        Sk8h* d0, Sk8h* d8) {
    auto v0 = s0.mulHi(gauss0);
    auto v1 = s0.mulHi(gauss1);
    auto v2 = s0.mulHi(gauss2);
    auto v3 = s0.mulHi(gauss3);
    auto v4 = s0.mulHi(gauss4);

    // D[n..n+7] += S[n..n+7] * G[4]
    *d0 += v4;

    // D[n..n+8] += {0, S[n..n+7] * G[3]}
    *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
    *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};

    // D[n..n+9] += {0, 0, S[n..n+7] * G[2]}
    *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
    *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};

    // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};

    // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]}
    *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
    *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};

    // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
    *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
    *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};

    // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
    *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};

    // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
    *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};

    // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
    *d8 += v4;
}
| 545 | |
// Function type shared by the blur_x_radius_(1|2|3|4) kernels above.
using BlurX = decltype(blur_x_radius_1);
| 547 | |
| 548 | // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). |
| 549 | static void blur_row( |
| 550 | BlurX blur, |
| 551 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, |
| 552 | const uint8_t* src, int srcW, |
| 553 | uint8_t* dst, int dstW) { |
| 554 | // Clear the buffer to handle summing wider than source. |
| 555 | Sk8h d0{kHalf}, d8{kHalf}; |
| 556 | |
| 557 | // Go by multiples of 8 in src. |
| 558 | int x = 0; |
| 559 | for (; x <= srcW - 8; x += 8) { |
| 560 | blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8); |
| 561 | |
| 562 | store(dst, d0, 8); |
| 563 | |
| 564 | d0 = d8; |
| 565 | d8 = Sk8h{kHalf}; |
| 566 | |
| 567 | src += 8; |
| 568 | dst += 8; |
| 569 | } |
| 570 | |
| 571 | // There are src values left, but the remainder of src values is not a multiple of 8. |
| 572 | int srcTail = srcW - x; |
| 573 | if (srcTail > 0) { |
| 574 | |
| 575 | blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8); |
| 576 | |
| 577 | int dstTail = std::min(8, dstW - x); |
| 578 | store(dst, d0, dstTail); |
| 579 | |
| 580 | d0 = d8; |
| 581 | dst += dstTail; |
| 582 | x += dstTail; |
| 583 | } |
| 584 | |
| 585 | // There are dst mask values to complete. |
| 586 | int dstTail = dstW - x; |
| 587 | if (dstTail > 0) { |
| 588 | store(dst, d0, dstTail); |
| 589 | } |
| 590 | } |
| 591 | |
| 592 | // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). |
| 593 | static void blur_x_rect(BlurX blur, |
| 594 | uint16_t* gauss, |
| 595 | const uint8_t* src, size_t srcStride, int srcW, |
| 596 | uint8_t* dst, size_t dstStride, int dstW, int dstH) { |
| 597 | |
| 598 | Sk8h g0{gauss[0]}, |
| 599 | g1{gauss[1]}, |
| 600 | g2{gauss[2]}, |
| 601 | g3{gauss[3]}, |
| 602 | g4{gauss[4]}; |
| 603 | |
| 604 | // Blur *ALL* the rows. |
| 605 | for (int y = 0; y < dstH; y++) { |
| 606 | blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW); |
| 607 | src += srcStride; |
| 608 | dst += dstStride; |
| 609 | } |
| 610 | } |
| 611 | |
| 612 | static void direct_blur_x(int radius, uint16_t* gauss, |
| 613 | const uint8_t* src, size_t srcStride, int srcW, |
| 614 | uint8_t* dst, size_t dstStride, int dstW, int dstH) { |
| 615 | |
| 616 | switch (radius) { |
| 617 | case 1: |
| 618 | blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
| 619 | break; |
| 620 | |
| 621 | case 2: |
| 622 | blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
| 623 | break; |
| 624 | |
| 625 | case 3: |
| 626 | blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
| 627 | break; |
| 628 | |
| 629 | case 4: |
| 630 | blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
| 631 | break; |
| 632 | |
| 633 | default: |
| 634 | SkASSERTF(false, "The radius %d is not handled\n" , radius); |
| 635 | } |
| 636 | } |
| 637 | |
| 638 | // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N |
| 639 | // functions, but end up being simpler because there is no complicated shift of registers. We |
| 640 | // start with the non-traditional form of the gaussian filter. In the following r is the value |
| 641 | // when added generates the next value in the column. |
| 642 | // |
| 643 | // D[n+0r] = S[n+0r]*G[1] |
| 644 | // + S[n+1r]*G[0] |
| 645 | // + S[n+2r]*G[1] |
| 646 | // |
| 647 | // Expanding out in a way similar to blur_x_radius_N for specific values of n. |
| 648 | // |
| 649 | // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1] |
| 650 | // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1] |
| 651 | // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1] |
| 652 | // |
// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
// can look at the calculation from the point of view of a source value.
| 655 | // |
| 656 | // Given S[n+0r]: |
| 657 | // D[n+0r] += S[n+0r]*G[1]; |
| 658 | // /* D[n+0r] is done and can be stored now. */ |
| 659 | // D[n+1r] += S[n+0r]*G[0]; |
| 660 | // D[n+2r] = S[n+0r]*G[1]; |
| 661 | // |
| 662 | // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in |
| 663 | // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also, |
| 664 | // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice |
| 665 | // how values flow in the following two iterations in source. |
| 666 | // |
| 667 | // D[n+0r] += S[n+0r]*G[1] |
| 668 | // D[n+1r] += S[n+0r]*G[0] |
| 669 | // D[n+2r] = S[n+0r]*G[1] |
| 670 | // /* ------- */ |
| 671 | // D[n+1r] += S[n+1r]*G[1] |
| 672 | // D[n+2r] += S[n+1r]*G[0] |
| 673 | // D[n+3r] = S[n+1r]*G[1] |
| 674 | // |
| 675 | // Instead of using memory we can introduce temporaries d01 and d12. The update step changes |
| 676 | // to the following. |
| 677 | // |
| 678 | // answer = d01 + S[n+0r]*G[1] |
| 679 | // d01 = d12 + S[n+0r]*G[0] |
| 680 | // d12 = S[n+0r]*G[1] |
| 681 | // return answer |
| 682 | // |
| 683 | // Finally, this can be ganged into SIMD style. |
| 684 | // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1] |
| 685 | // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] |
| 686 | // d12[0..7] = S[n+0r..n+0r+7]*G[1] |
| 687 | // return answer[0..7] |
| 688 | static Sk8h blur_y_radius_1( |
| 689 | const Sk8h& s0, |
| 690 | const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, |
| 691 | Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { |
| 692 | auto v0 = s0.mulHi(g0); |
| 693 | auto v1 = s0.mulHi(g1); |
| 694 | |
| 695 | Sk8h answer = *d01 + v1; |
| 696 | *d01 = *d12 + v0; |
| 697 | *d12 = v1 + kHalf; |
| 698 | |
| 699 | return answer; |
| 700 | } |
| 701 | |
| 702 | static Sk8h blur_y_radius_2( |
| 703 | const Sk8h& s0, |
| 704 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, |
| 705 | Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { |
| 706 | auto v0 = s0.mulHi(g0); |
| 707 | auto v1 = s0.mulHi(g1); |
| 708 | auto v2 = s0.mulHi(g2); |
| 709 | |
| 710 | Sk8h answer = *d01 + v2; |
| 711 | *d01 = *d12 + v1; |
| 712 | *d12 = *d23 + v0; |
| 713 | *d23 = *d34 + v1; |
| 714 | *d34 = v2 + kHalf; |
| 715 | |
| 716 | return answer; |
| 717 | } |
| 718 | |
// Advance the radius-3 vertical blur by one source row for 8 columns: return
// the completed row and shift the six pending partial-sum rows down by one.
static Sk8h blur_y_radius_3(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
    auto v0 = s0.mulHi(g0);
    auto v1 = s0.mulHi(g1);
    auto v2 = s0.mulHi(g2);
    auto v3 = s0.mulHi(g3);

    // Oldest row completes; the rest slide down one position.
    Sk8h answer = *d01 + v3;
    *d01 = *d12 + v2;
    *d12 = *d23 + v1;
    *d23 = *d34 + v0;
    *d34 = *d45 + v1;
    *d45 = *d56 + v2;
    // The freshest row starts with the rounding constant baked in.
    *d56 = v3 + kHalf;

    return answer;
}
| 738 | |
// Advance the radius-4 vertical blur by one source row for 8 columns: return
// the completed row and shift the eight pending partial-sum rows down by one.
static Sk8h blur_y_radius_4(
    const Sk8h& s0,
    const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
    auto v0 = s0.mulHi(g0);
    auto v1 = s0.mulHi(g1);
    auto v2 = s0.mulHi(g2);
    auto v3 = s0.mulHi(g3);
    auto v4 = s0.mulHi(g4);

    // Oldest row completes; the rest slide down one position.
    Sk8h answer = *d01 + v4;
    *d01 = *d12 + v3;
    *d12 = *d23 + v2;
    *d23 = *d34 + v1;
    *d34 = *d45 + v0;
    *d45 = *d56 + v1;
    *d56 = *d67 + v2;
    *d67 = *d78 + v3;
    // The freshest row starts with the rounding constant baked in.
    *d78 = v4 + kHalf;

    return answer;
}
| 761 | |
// Function type shared by the blur_y_radius_(1|2|3|4) kernels above.
using BlurY = decltype(blur_y_radius_1);
| 763 | |
| 764 | // BlurY will be one of blur_y_radius_(1|2|3|4). |
// Vertically blur one column strip (width <= 8 pixels) of the mask. Each
// source row pushed through the kernel yields one completed destination row;
// the 2*radius rows still in flight afterwards are flushed below the source.
static void blur_column(
    ToA8 toA8,
    BlurY blur, int radius, int width,
    const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    const uint8_t* src, size_t srcRB, int srcH,
    uint8_t* dst, size_t dstRB) {
    // Pending partial-sum rows, pre-seeded with the rounding constant.
    Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
         d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};

    // Write two pending rows and return the advanced destination pointer.
    auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
        store(to, v0, width);
        to += dstRB;
        store(to, v1, width);
        return to + dstRB;
    };

    // Feed every source row through the pipeline; each call to blur returns
    // one completed destination row.
    for (int y = 0; y < srcH; y += 1) {
        auto s = load(src, width, toA8);
        auto b = blur(s,
                      g0, g1, g2, g3, g4,
                      &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
        store(dst, b, width);
        src += srcRB;
        dst += dstRB;
    }

    // Drain the 2*radius rows still in flight past the bottom of the source.
    if (radius >= 1) {
        dst = flush(dst, d01, d12);
    }
    if (radius >= 2) {
        dst = flush(dst, d23, d34);
    }
    if (radius >= 3) {
        dst = flush(dst, d45, d56);
    }
    if (radius >= 4) {
        flush(dst, d67, d78);
    }
}
| 804 | |
| 805 | // BlurY will be one of blur_y_radius_(1|2|3|4). |
| 806 | static void blur_y_rect(ToA8 toA8, const int strideOf8, |
| 807 | BlurY blur, int radius, uint16_t *gauss, |
| 808 | const uint8_t *src, size_t srcRB, int srcW, int srcH, |
| 809 | uint8_t *dst, size_t dstRB) { |
| 810 | |
| 811 | Sk8h g0{gauss[0]}, |
| 812 | g1{gauss[1]}, |
| 813 | g2{gauss[2]}, |
| 814 | g3{gauss[3]}, |
| 815 | g4{gauss[4]}; |
| 816 | |
| 817 | int x = 0; |
| 818 | for (; x <= srcW - 8; x += 8) { |
| 819 | blur_column(toA8, blur, radius, 8, |
| 820 | g0, g1, g2, g3, g4, |
| 821 | src, srcRB, srcH, |
| 822 | dst, dstRB); |
| 823 | src += strideOf8; |
| 824 | dst += 8; |
| 825 | } |
| 826 | |
| 827 | int xTail = srcW - x; |
| 828 | if (xTail > 0) { |
| 829 | blur_column(toA8, blur, radius, xTail, |
| 830 | g0, g1, g2, g3, g4, |
| 831 | src, srcRB, srcH, |
| 832 | dst, dstRB); |
| 833 | } |
| 834 | } |
| 835 | |
| 836 | static void direct_blur_y(ToA8 toA8, const int strideOf8, |
| 837 | int radius, uint16_t* gauss, |
| 838 | const uint8_t* src, size_t srcRB, int srcW, int srcH, |
| 839 | uint8_t* dst, size_t dstRB) { |
| 840 | |
| 841 | switch (radius) { |
| 842 | case 1: |
| 843 | blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss, |
| 844 | src, srcRB, srcW, srcH, |
| 845 | dst, dstRB); |
| 846 | break; |
| 847 | |
| 848 | case 2: |
| 849 | blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss, |
| 850 | src, srcRB, srcW, srcH, |
| 851 | dst, dstRB); |
| 852 | break; |
| 853 | |
| 854 | case 3: |
| 855 | blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss, |
| 856 | src, srcRB, srcW, srcH, |
| 857 | dst, dstRB); |
| 858 | break; |
| 859 | |
| 860 | case 4: |
| 861 | blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss, |
| 862 | src, srcRB, srcW, srcH, |
| 863 | dst, dstRB); |
| 864 | break; |
| 865 | |
| 866 | default: |
| 867 | SkASSERTF(false, "The radius %d is not handled\n" , radius); |
| 868 | } |
| 869 | } |
| 870 | |
| 871 | static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) { |
| 872 | SkASSERT(sigmaX == sigmaY); // TODO |
| 873 | SkASSERT(0.01 <= sigmaX && sigmaX < 2); |
| 874 | SkASSERT(0.01 <= sigmaY && sigmaY < 2); |
| 875 | |
| 876 | SkGaussFilter filterX{sigmaX}, |
| 877 | filterY{sigmaY}; |
| 878 | |
| 879 | int radiusX = filterX.radius(), |
| 880 | radiusY = filterY.radius(); |
| 881 | |
| 882 | SkASSERT(radiusX <= 4 && radiusY <= 4); |
| 883 | |
| 884 | auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) { |
| 885 | int i = 0; |
| 886 | for (double d : filter) { |
| 887 | factors[i++] = static_cast<uint16_t>(round(d * (1 << 16))); |
| 888 | } |
| 889 | }; |
| 890 | |
| 891 | uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax], |
| 892 | gaussFactorsY[SkGaussFilter::kGaussArrayMax]; |
| 893 | |
| 894 | prepareGauss(filterX, gaussFactorsX); |
| 895 | prepareGauss(filterY, gaussFactorsY); |
| 896 | |
| 897 | *dst = SkMask::PrepareDestination(radiusX, radiusY, src); |
| 898 | if (src.fImage == nullptr) { |
| 899 | return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)}; |
| 900 | } |
| 901 | if (dst->fImage == nullptr) { |
| 902 | dst->fBounds.setEmpty(); |
| 903 | return {0, 0}; |
| 904 | } |
| 905 | |
| 906 | int srcW = src.fBounds.width(), |
| 907 | srcH = src.fBounds.height(); |
| 908 | |
| 909 | int dstW = dst->fBounds.width(), |
| 910 | dstH = dst->fBounds.height(); |
| 911 | |
| 912 | size_t srcRB = src.fRowBytes, |
| 913 | dstRB = dst->fRowBytes; |
| 914 | |
| 915 | //TODO: handle bluring in only one direction. |
| 916 | |
| 917 | // Blur vertically and copy to destination. |
| 918 | switch (src.fFormat) { |
| 919 | case SkMask::kBW_Format: |
| 920 | direct_blur_y(bw_to_a8, 1, |
| 921 | radiusY, gaussFactorsY, |
| 922 | src.fImage, srcRB, srcW, srcH, |
| 923 | dst->fImage + radiusX, dstRB); |
| 924 | break; |
| 925 | case SkMask::kA8_Format: |
| 926 | direct_blur_y(nullptr, 8, |
| 927 | radiusY, gaussFactorsY, |
| 928 | src.fImage, srcRB, srcW, srcH, |
| 929 | dst->fImage + radiusX, dstRB); |
| 930 | break; |
| 931 | case SkMask::kARGB32_Format: |
| 932 | direct_blur_y(argb32_to_a8, 32, |
| 933 | radiusY, gaussFactorsY, |
| 934 | src.fImage, srcRB, srcW, srcH, |
| 935 | dst->fImage + radiusX, dstRB); |
| 936 | break; |
| 937 | case SkMask::kLCD16_Format: |
| 938 | direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY, |
| 939 | src.fImage, srcRB, srcW, srcH, |
| 940 | dst->fImage + radiusX, dstRB); |
| 941 | break; |
| 942 | default: |
| 943 | SK_ABORT("Unhandled format." ); |
| 944 | } |
| 945 | |
| 946 | // Blur horizontally in place. |
| 947 | direct_blur_x(radiusX, gaussFactorsX, |
| 948 | dst->fImage + radiusX, dstRB, srcW, |
| 949 | dst->fImage, dstRB, dstW, dstH); |
| 950 | |
| 951 | return {radiusX, radiusY}; |
| 952 | } |
| 953 | |
// TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
// API forces the sigmas to be the same.
//
// Blur src into dst; returns the (x, y) border by which dst's bounds outgrow
// src's bounds. Small sigmas (< 2) use the direct Gauss path; larger sigmas
// use a multi-pass box-blur approximation planned by PlanGauss.
SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {

    if (fSigmaW < 2.0 && fSigmaH < 2.0) {
        return small_blur(fSigmaW, fSigmaH, src, dst);
    }

    // 1024 is a place holder guess until more analysis can be done.
    SkSTArenaAlloc<1024> alloc;

    // One plan per axis; each knows its border growth and scratch-buffer size.
    PlanGauss planW(fSigmaW);
    PlanGauss planH(fSigmaH);

    int borderW = planW.border(),
        borderH = planH.border();
    SkASSERT(borderH >= 0 && borderW >= 0);

    // Destination bounds are the source bounds outset by the borders.
    *dst = SkMask::PrepareDestination(borderW, borderH, src);
    if (src.fImage == nullptr) {
        // Bounds-only request: report how far the blur spreads.
        return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
    }
    if (dst->fImage == nullptr) {
        // Allocation failed; hand back an empty mask.
        dst->fBounds.setEmpty();
        return {0, 0};
    }

    int srcW = src.fBounds.width(),
        srcH = src.fBounds.height(),
        dstW = dst->fBounds.width(),
        dstH = dst->fBounds.height();
    SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);

    // Shared scratch buffer, sized for the larger of the two passes.
    auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
    auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);

    // Blur both directions.
    // The intermediate image is held transposed: tmpW = srcH (source rows
    // become tmp columns) and tmpH = dstW.
    int tmpW = srcH,
        tmpH = dstW;

    auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);

    // Blur horizontally, and transpose.
    // Each source row y is blurred into tmp starting at &tmp[y]; the format's
    // AlphaIter reads the source pixels as A8 coverage, and `>>= fRowBytes`
    // advances the iterators one source row.
    const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
    switch (src.fFormat) {
        case SkMask::kBW_Format: {
            // 1 bit per pixel: end sits at bit (srcW % 8) of byte (srcW / 8).
            const uint8_t* bwStart = src.fImage;
            auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
            auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
            for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
                auto tmpStart = &tmp[y];
                scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
            }
        } break;
        case SkMask::kA8_Format: {
            // 1 byte per pixel.
            const uint8_t* a8Start = src.fImage;
            auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
            auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
            for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
                auto tmpStart = &tmp[y];
                scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
            }
        } break;
        case SkMask::kARGB32_Format: {
            // 4 bytes per pixel.
            const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
            auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
            auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
            for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
                auto tmpStart = &tmp[y];
                scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
            }
        } break;
        case SkMask::kLCD16_Format: {
            // 2 bytes per pixel.
            const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
            auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
            auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
            for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
                auto tmpStart = &tmp[y];
                scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
            }
        } break;
        default:
            SK_ABORT("Unhandled format." );
    }

    // Blur vertically (scan in memory order because of the transposition),
    // and transpose back to the original orientation.
    const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
    for (int y = 0; y < tmpH; y++) {
        auto tmpStart = &tmp[y * tmpW];
        auto dstStart = &dst->fImage[y];

        scanH.blur(tmpStart, tmpStart + tmpW,
                   dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
    }

    return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
}
| 1052 | |