1 | /* |
2 | * Copyright 2017 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #include "src/core/SkMaskBlurFilter.h" |
9 | |
10 | #include "include/core/SkColorPriv.h" |
11 | #include "include/private/SkMalloc.h" |
12 | #include "include/private/SkNx.h" |
13 | #include "include/private/SkTemplates.h" |
14 | #include "include/private/SkTo.h" |
15 | #include "src/core/SkArenaAlloc.h" |
16 | #include "src/core/SkGaussFilter.h" |
17 | |
18 | #include <cmath> |
19 | #include <climits> |
20 | |
21 | namespace { |
22 | static const double kPi = 3.14159265358979323846264338327950288; |
23 | |
24 | class PlanGauss final { |
25 | public: |
26 | explicit PlanGauss(double sigma) { |
27 | auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)); |
28 | auto window = std::max(1, possibleWindow); |
29 | |
30 | fPass0Size = window - 1; |
31 | fPass1Size = window - 1; |
32 | fPass2Size = (window & 1) == 1 ? window - 1 : window; |
33 | |
34 | // Calculating the border is tricky. I will go through the odd case which is simpler, and |
35 | // then through the even case. Given a stack of filters seven wide for the odd case of |
36 | // three passes. |
37 | // |
38 | // S |
39 | // aaaAaaa |
40 | // bbbBbbb |
41 | // cccCccc |
42 | // D |
43 | // |
44 | // The furthest changed pixel is when the filters are in the following configuration. |
45 | // |
46 | // S |
47 | // aaaAaaa |
48 | // bbbBbbb |
49 | // cccCccc |
50 | // D |
51 | // |
52 | // The A pixel is calculated using the value S, the B uses A, and the C uses B, and |
53 | // finally D is C. So, with a window size of seven the border is nine. In general, the |
54 | // border is 3*((window - 1)/2). |
55 | // |
56 | // For even cases the filter stack is more complicated. The spec specifies two passes |
57 | // of even filters and a final pass of odd filters. A stack for a width of six looks like |
58 | // this. |
59 | // |
60 | // S |
61 | // aaaAaa |
62 | // bbBbbb |
63 | // cccCccc |
64 | // D |
65 | // |
66 | // The furthest pixel looks like this. |
67 | // |
68 | // S |
69 | // aaaAaa |
70 | // bbBbbb |
71 | // cccCccc |
72 | // D |
73 | // |
74 | // For a window of size, the border value is seven. In general the border is 3 * |
75 | // (window/2) -1. |
76 | fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1; |
77 | fSlidingWindow = 2 * fBorder + 1; |
78 | |
79 | // If the window is odd then the divisor is just window ^ 3 otherwise, |
80 | // it is window * window * (window + 1) = window ^ 2 + window ^ 3; |
81 | auto window2 = window * window; |
82 | auto window3 = window2 * window; |
83 | auto divisor = (window & 1) == 1 ? window3 : window3 + window2; |
84 | |
85 | fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32))); |
86 | } |
87 | |
88 | size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; } |
89 | |
90 | int border() const { return fBorder; } |
91 | |
92 | public: |
93 | class Scan { |
94 | public: |
95 | Scan(uint64_t weight, int noChangeCount, |
96 | uint32_t* buffer0, uint32_t* buffer0End, |
97 | uint32_t* buffer1, uint32_t* buffer1End, |
98 | uint32_t* buffer2, uint32_t* buffer2End) |
99 | : fWeight{weight} |
100 | , fNoChangeCount{noChangeCount} |
101 | , fBuffer0{buffer0} |
102 | , fBuffer0End{buffer0End} |
103 | , fBuffer1{buffer1} |
104 | , fBuffer1End{buffer1End} |
105 | , fBuffer2{buffer2} |
106 | , fBuffer2End{buffer2End} |
107 | { } |
108 | |
109 | template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd, |
110 | uint8_t* dst, int dstStride, uint8_t* dstEnd) const { |
111 | auto buffer0Cursor = fBuffer0; |
112 | auto buffer1Cursor = fBuffer1; |
113 | auto buffer2Cursor = fBuffer2; |
114 | |
115 | std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); |
116 | |
117 | uint32_t sum0 = 0; |
118 | uint32_t sum1 = 0; |
119 | uint32_t sum2 = 0; |
120 | |
121 | // Consume the source generating pixels. |
122 | for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) { |
123 | uint32_t leadingEdge = *src; |
124 | sum0 += leadingEdge; |
125 | sum1 += sum0; |
126 | sum2 += sum1; |
127 | |
128 | *dst = this->finalScale(sum2); |
129 | |
130 | sum2 -= *buffer2Cursor; |
131 | *buffer2Cursor = sum1; |
132 | buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; |
133 | |
134 | sum1 -= *buffer1Cursor; |
135 | *buffer1Cursor = sum0; |
136 | buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; |
137 | |
138 | sum0 -= *buffer0Cursor; |
139 | *buffer0Cursor = leadingEdge; |
140 | buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; |
141 | } |
142 | |
143 | // The leading edge is off the right side of the mask. |
144 | for (int i = 0; i < fNoChangeCount; i++) { |
145 | uint32_t leadingEdge = 0; |
146 | sum0 += leadingEdge; |
147 | sum1 += sum0; |
148 | sum2 += sum1; |
149 | |
150 | *dst = this->finalScale(sum2); |
151 | |
152 | sum2 -= *buffer2Cursor; |
153 | *buffer2Cursor = sum1; |
154 | buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; |
155 | |
156 | sum1 -= *buffer1Cursor; |
157 | *buffer1Cursor = sum0; |
158 | buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; |
159 | |
160 | sum0 -= *buffer0Cursor; |
161 | *buffer0Cursor = leadingEdge; |
162 | buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; |
163 | |
164 | dst += dstStride; |
165 | } |
166 | |
167 | // Starting from the right, fill in the rest of the buffer. |
168 | std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); |
169 | |
170 | sum0 = sum1 = sum2 = 0; |
171 | |
172 | uint8_t* dstCursor = dstEnd; |
173 | AlphaIter src = srcEnd; |
174 | while (dstCursor > dst) { |
175 | dstCursor -= dstStride; |
176 | uint32_t leadingEdge = *(--src); |
177 | sum0 += leadingEdge; |
178 | sum1 += sum0; |
179 | sum2 += sum1; |
180 | |
181 | *dstCursor = this->finalScale(sum2); |
182 | |
183 | sum2 -= *buffer2Cursor; |
184 | *buffer2Cursor = sum1; |
185 | buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; |
186 | |
187 | sum1 -= *buffer1Cursor; |
188 | *buffer1Cursor = sum0; |
189 | buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; |
190 | |
191 | sum0 -= *buffer0Cursor; |
192 | *buffer0Cursor = leadingEdge; |
193 | buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; |
194 | } |
195 | } |
196 | |
197 | private: |
198 | static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31; |
199 | |
200 | uint8_t finalScale(uint32_t sum) const { |
201 | return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32); |
202 | } |
203 | |
204 | uint64_t fWeight; |
205 | int fNoChangeCount; |
206 | uint32_t* fBuffer0; |
207 | uint32_t* fBuffer0End; |
208 | uint32_t* fBuffer1; |
209 | uint32_t* fBuffer1End; |
210 | uint32_t* fBuffer2; |
211 | uint32_t* fBuffer2End; |
212 | }; |
213 | |
214 | Scan makeBlurScan(int width, uint32_t* buffer) const { |
215 | uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End; |
216 | buffer0 = buffer; |
217 | buffer0End = buffer1 = buffer0 + fPass0Size; |
218 | buffer1End = buffer2 = buffer1 + fPass1Size; |
219 | buffer2End = buffer2 + fPass2Size; |
220 | int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0; |
221 | |
222 | return Scan( |
223 | fWeight, noChangeCount, |
224 | buffer0, buffer0End, |
225 | buffer1, buffer1End, |
226 | buffer2, buffer2End); |
227 | } |
228 | |
229 | uint64_t fWeight; |
230 | int fBorder; |
231 | int fSlidingWindow; |
232 | int fPass0Size; |
233 | int fPass1Size; |
234 | int fPass2Size; |
235 | }; |
236 | |
237 | } // namespace |
238 | |
239 | // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow |
240 | // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The |
241 | // additional + 1 added to window represents adding one more leading element before subtracting the |
242 | // trailing element. |
243 | // Explanation of maximums: |
244 | // sum0 = (window + 1) * 255 |
245 | // sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255 |
246 | // sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255 |
247 | // |
248 | // The value (window + 1)^3 * 255 must fit in a uint32_t. So, |
249 | // (window + 1)^3 * 255 < 2^32. window = 255. |
250 | // |
251 | // window = floor(sigma * 3 * sqrt(2 * kPi) / 4) |
252 | // For window <= 255, the largest value for sigma is 135. |
253 | SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH) |
254 | : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)} |
255 | , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)} |
256 | { |
257 | SkASSERT(sigmaW >= 0); |
258 | SkASSERT(sigmaH >= 0); |
259 | } |
260 | |
261 | bool SkMaskBlurFilter::hasNoBlur() const { |
262 | return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1); |
263 | } |
264 | |
265 | // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first. |
266 | // Each of these converts width (up to 8) mask values to A8. |
267 | static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
268 | SkASSERT(0 < width && width <= 8); |
269 | |
270 | uint8_t masks = *from; |
271 | for (int i = 0; i < width; ++i) { |
272 | a8[i] = (masks >> (7 - i)) & 1 ? 0xFF |
273 | : 0x00; |
274 | } |
275 | } |
276 | static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
277 | SkASSERT(0 < width && width <= 8); |
278 | |
279 | for (int i = 0; i < width; ++i) { |
280 | unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i], |
281 | r = SkPacked16ToR32(rgb), |
282 | g = SkPacked16ToG32(rgb), |
283 | b = SkPacked16ToB32(rgb); |
284 | a8[i] = (r + g + b) / 3; |
285 | } |
286 | } |
287 | static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { |
288 | SkASSERT(0 < width && width <= 8); |
289 | for (int i = 0; i < width; ++i) { |
290 | uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i]; |
291 | a8[i] = SkGetPackedA32(rgba); |
292 | } |
293 | } |
294 | using ToA8 = decltype(bw_to_a8); |
295 | |
296 | static Sk8h load(const uint8_t* from, int width, ToA8* toA8) { |
297 | // Our fast path is a full 8-byte load of A8. |
298 | // So we'll conditionally handle the two slow paths using tmp: |
299 | // - if we have a function to convert another mask to A8, use it; |
300 | // - if not but we have less than 8 bytes to load, load them one at a time. |
301 | uint8_t tmp[8] = {0,0,0,0, 0,0,0,0}; |
302 | if (toA8) { |
303 | toA8(tmp, from, width); |
304 | from = tmp; |
305 | } else if (width < 8) { |
306 | for (int i = 0; i < width; ++i) { |
307 | tmp[i] = from[i]; |
308 | } |
309 | from = tmp; |
310 | } |
311 | |
312 | // Load A8 and convert to 8.8 fixed-point. |
313 | return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8; |
314 | } |
315 | |
316 | static void store(uint8_t* to, const Sk8h& v, int width) { |
317 | Sk8b b = SkNx_cast<uint8_t>(v >> 8); |
318 | if (width == 8) { |
319 | b.store(to); |
320 | } else { |
321 | uint8_t buffer[8]; |
322 | b.store(buffer); |
323 | for (int i = 0; i < width; i++) { |
324 | to[i] = buffer[i]; |
325 | } |
326 | } |
327 | }; |
328 | |
329 | static constexpr uint16_t _____ = 0u; |
330 | static constexpr uint16_t kHalf = 0x80u; |
331 | |
332 | // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded |
333 | // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8 |
334 | // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits |
335 | // resulting in a 8.8 format. |
336 | // |
337 | // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This |
338 | // system is setup to minimize the number of multiplies needed. |
339 | // |
340 | // Explanation: |
341 | // Blurring a specific mask value is given by the following equation where D_n is the resulting |
342 | // mask value and S_n is the source value. The example below is for a filter with a radius of 1 |
343 | // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are |
344 | // aligned. The filter is given by G_n where n is the symmetric filter value. |
345 | // |
346 | // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1]. |
347 | // |
348 | // We can start the source index at an offset relative to the destination separated by the |
349 | // radius. This results in a non-traditional restating of the above filter. |
350 | // |
351 | // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1] |
352 | // |
353 | // If we look at three specific consecutive destinations the following equations result: |
354 | // |
355 | // D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1] |
356 | // D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1] |
357 | // D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1]. |
358 | // |
359 | // In the above equations, notice that S[7] is used in all three. In particular, two values are |
360 | // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and |
361 | // D[7]. |
362 | // |
363 | // From the point of view of a source value we end up with the following three equations. |
364 | // |
365 | // Given S[7]: |
366 | // D[5] += S[7]*G[1] |
367 | // D[6] += S[7]*G[0] |
368 | // D[7] += S[7]*G[1] |
369 | // |
370 | // In General: |
371 | // D[n] += S[n]*G[1] |
372 | // D[n+1] += S[n]*G[0] |
373 | // D[n+2] += S[n]*G[1] |
374 | // |
375 | // Now these equations can be ganged using SIMD to form: |
376 | // D[n..n+7] += S[n..n+7]*G[1] |
377 | // D[n+1..n+8] += S[n..n+7]*G[0] |
378 | // D[n+2..n+9] += S[n..n+7]*G[1] |
379 | // The next set of values becomes. |
380 | // D[n+8..n+15] += S[n+8..n+15]*G[1] |
381 | // D[n+9..n+16] += S[n+8..n+15]*G[0] |
382 | // D[n+10..n+17] += S[n+8..n+15]*G[1] |
383 | // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both |
384 | // S[n..7] and S[n+8..n+15]. |
385 | // |
386 | // Just one more transformation allows the code to maintain all working values in |
387 | // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is |
388 | // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}. |
389 | // |
390 | // D[n..n+7] += S[n..n+7] * G[1] |
391 | // D[n..n+8] += {0, S[n..n+7] * G[0]} |
392 | // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} |
393 | // |
394 | // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a |
395 | // register d8. In addition, S[0..n+7] becomes s0. |
396 | // |
397 | // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below. |
398 | // |
399 | // Sk8h v0 = s0*G[0] |
400 | // Sk8h v1 = s0*G[1] |
401 | // /* D[n..n+7] += S[n..n+7] * G[1] */ |
402 | // d0 += v1; |
403 | // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */ |
404 | // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]} |
405 | // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____} |
406 | // /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */ |
407 | // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]} |
408 | // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} |
409 | // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. |
410 | |
411 | static void blur_x_radius_1( |
412 | const Sk8h& s0, |
413 | const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, |
414 | Sk8h* d0, Sk8h* d8) { |
415 | |
416 | auto v1 = s0.mulHi(g1); |
417 | auto v0 = s0.mulHi(g0); |
418 | |
419 | // D[n..n+7] += S[n..n+7] * G[1] |
420 | *d0 += v1; |
421 | |
422 | //D[n..n+8] += {0, S[n..n+7] * G[0]} |
423 | *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; |
424 | *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____}; |
425 | |
426 | // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} |
427 | *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; |
428 | *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; |
429 | |
430 | } |
431 | |
432 | static void blur_x_radius_2( |
433 | const Sk8h& s0, |
434 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, |
435 | Sk8h* d0, Sk8h* d8) { |
436 | auto v0 = s0.mulHi(g0); |
437 | auto v1 = s0.mulHi(g1); |
438 | auto v2 = s0.mulHi(g2); |
439 | |
440 | // D[n..n+7] += S[n..n+7] * G[2] |
441 | *d0 += v2; |
442 | |
443 | // D[n..n+8] += {0, S[n..n+7] * G[1]} |
444 | *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]}; |
445 | *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____}; |
446 | |
447 | // D[n..n+9] += {0, 0, S[n..n+7] * G[0]} |
448 | *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]}; |
449 | *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____}; |
450 | |
451 | // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} |
452 | *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; |
453 | *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; |
454 | |
455 | // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]} |
456 | *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]}; |
457 | *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; |
458 | } |
459 | |
460 | static void blur_x_radius_3( |
461 | const Sk8h& s0, |
462 | const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&, |
463 | Sk8h* d0, Sk8h* d8) { |
464 | auto v0 = s0.mulHi(gauss0); |
465 | auto v1 = s0.mulHi(gauss1); |
466 | auto v2 = s0.mulHi(gauss2); |
467 | auto v3 = s0.mulHi(gauss3); |
468 | |
469 | // D[n..n+7] += S[n..n+7] * G[3] |
470 | *d0 += v3; |
471 | |
472 | // D[n..n+8] += {0, S[n..n+7] * G[2]} |
473 | *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]}; |
474 | *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____}; |
475 | |
476 | // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} |
477 | *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; |
478 | *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; |
479 | |
480 | // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]} |
481 | *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]}; |
482 | *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____}; |
483 | |
484 | // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]} |
485 | *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]}; |
486 | *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____}; |
487 | |
488 | // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]} |
489 | *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]}; |
490 | *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____}; |
491 | |
492 | // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} |
493 | *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]}; |
494 | *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; |
495 | } |
496 | |
497 | static void blur_x_radius_4( |
498 | const Sk8h& s0, |
499 | const Sk8h& gauss0, |
500 | const Sk8h& gauss1, |
501 | const Sk8h& gauss2, |
502 | const Sk8h& gauss3, |
503 | const Sk8h& gauss4, |
504 | Sk8h* d0, Sk8h* d8) { |
505 | auto v0 = s0.mulHi(gauss0); |
506 | auto v1 = s0.mulHi(gauss1); |
507 | auto v2 = s0.mulHi(gauss2); |
508 | auto v3 = s0.mulHi(gauss3); |
509 | auto v4 = s0.mulHi(gauss4); |
510 | |
511 | // D[n..n+7] += S[n..n+7] * G[4] |
512 | *d0 += v4; |
513 | |
514 | // D[n..n+8] += {0, S[n..n+7] * G[3]} |
515 | *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]}; |
516 | *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____}; |
517 | |
518 | // D[n..n+9] += {0, 0, S[n..n+7] * G[2]} |
519 | *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]}; |
520 | *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____}; |
521 | |
522 | // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} |
523 | *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; |
524 | *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; |
525 | |
526 | // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]} |
527 | *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]}; |
528 | *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____}; |
529 | |
530 | // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]} |
531 | *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]}; |
532 | *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____}; |
533 | |
534 | // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]} |
535 | *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]}; |
536 | *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____}; |
537 | |
538 | // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} |
539 | *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]}; |
540 | *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____}; |
541 | |
542 | // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]} |
543 | *d8 += v4; |
544 | } |
545 | |
546 | using BlurX = decltype(blur_x_radius_1); |
547 | |
548 | // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). |
549 | static void blur_row( |
550 | BlurX blur, |
551 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, |
552 | const uint8_t* src, int srcW, |
553 | uint8_t* dst, int dstW) { |
554 | // Clear the buffer to handle summing wider than source. |
555 | Sk8h d0{kHalf}, d8{kHalf}; |
556 | |
557 | // Go by multiples of 8 in src. |
558 | int x = 0; |
559 | for (; x <= srcW - 8; x += 8) { |
560 | blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8); |
561 | |
562 | store(dst, d0, 8); |
563 | |
564 | d0 = d8; |
565 | d8 = Sk8h{kHalf}; |
566 | |
567 | src += 8; |
568 | dst += 8; |
569 | } |
570 | |
571 | // There are src values left, but the remainder of src values is not a multiple of 8. |
572 | int srcTail = srcW - x; |
573 | if (srcTail > 0) { |
574 | |
575 | blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8); |
576 | |
577 | int dstTail = std::min(8, dstW - x); |
578 | store(dst, d0, dstTail); |
579 | |
580 | d0 = d8; |
581 | dst += dstTail; |
582 | x += dstTail; |
583 | } |
584 | |
585 | // There are dst mask values to complete. |
586 | int dstTail = dstW - x; |
587 | if (dstTail > 0) { |
588 | store(dst, d0, dstTail); |
589 | } |
590 | } |
591 | |
592 | // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). |
593 | static void blur_x_rect(BlurX blur, |
594 | uint16_t* gauss, |
595 | const uint8_t* src, size_t srcStride, int srcW, |
596 | uint8_t* dst, size_t dstStride, int dstW, int dstH) { |
597 | |
598 | Sk8h g0{gauss[0]}, |
599 | g1{gauss[1]}, |
600 | g2{gauss[2]}, |
601 | g3{gauss[3]}, |
602 | g4{gauss[4]}; |
603 | |
604 | // Blur *ALL* the rows. |
605 | for (int y = 0; y < dstH; y++) { |
606 | blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW); |
607 | src += srcStride; |
608 | dst += dstStride; |
609 | } |
610 | } |
611 | |
612 | static void direct_blur_x(int radius, uint16_t* gauss, |
613 | const uint8_t* src, size_t srcStride, int srcW, |
614 | uint8_t* dst, size_t dstStride, int dstW, int dstH) { |
615 | |
616 | switch (radius) { |
617 | case 1: |
618 | blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
619 | break; |
620 | |
621 | case 2: |
622 | blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
623 | break; |
624 | |
625 | case 3: |
626 | blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
627 | break; |
628 | |
629 | case 4: |
630 | blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); |
631 | break; |
632 | |
633 | default: |
634 | SkASSERTF(false, "The radius %d is not handled\n" , radius); |
635 | } |
636 | } |
637 | |
638 | // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N |
639 | // functions, but end up being simpler because there is no complicated shift of registers. We |
640 | // start with the non-traditional form of the gaussian filter. In the following r is the value |
641 | // when added generates the next value in the column. |
642 | // |
643 | // D[n+0r] = S[n+0r]*G[1] |
644 | // + S[n+1r]*G[0] |
645 | // + S[n+2r]*G[1] |
646 | // |
647 | // Expanding out in a way similar to blur_x_radius_N for specific values of n. |
648 | // |
649 | // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1] |
650 | // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1] |
651 | // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1] |
652 | // |
653 | // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we |
654 | // can look at the calculation form the point of view of a source value. |
655 | // |
656 | // Given S[n+0r]: |
657 | // D[n+0r] += S[n+0r]*G[1]; |
658 | // /* D[n+0r] is done and can be stored now. */ |
659 | // D[n+1r] += S[n+0r]*G[0]; |
660 | // D[n+2r] = S[n+0r]*G[1]; |
661 | // |
662 | // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in |
663 | // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also, |
664 | // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice |
665 | // how values flow in the following two iterations in source. |
666 | // |
667 | // D[n+0r] += S[n+0r]*G[1] |
668 | // D[n+1r] += S[n+0r]*G[0] |
669 | // D[n+2r] = S[n+0r]*G[1] |
670 | // /* ------- */ |
671 | // D[n+1r] += S[n+1r]*G[1] |
672 | // D[n+2r] += S[n+1r]*G[0] |
673 | // D[n+3r] = S[n+1r]*G[1] |
674 | // |
675 | // Instead of using memory we can introduce temporaries d01 and d12. The update step changes |
676 | // to the following. |
677 | // |
678 | // answer = d01 + S[n+0r]*G[1] |
679 | // d01 = d12 + S[n+0r]*G[0] |
680 | // d12 = S[n+0r]*G[1] |
681 | // return answer |
682 | // |
683 | // Finally, this can be ganged into SIMD style. |
684 | // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1] |
685 | // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] |
686 | // d12[0..7] = S[n+0r..n+0r+7]*G[1] |
687 | // return answer[0..7] |
688 | static Sk8h blur_y_radius_1( |
689 | const Sk8h& s0, |
690 | const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, |
691 | Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { |
692 | auto v0 = s0.mulHi(g0); |
693 | auto v1 = s0.mulHi(g1); |
694 | |
695 | Sk8h answer = *d01 + v1; |
696 | *d01 = *d12 + v0; |
697 | *d12 = v1 + kHalf; |
698 | |
699 | return answer; |
700 | } |
701 | |
702 | static Sk8h blur_y_radius_2( |
703 | const Sk8h& s0, |
704 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, |
705 | Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { |
706 | auto v0 = s0.mulHi(g0); |
707 | auto v1 = s0.mulHi(g1); |
708 | auto v2 = s0.mulHi(g2); |
709 | |
710 | Sk8h answer = *d01 + v2; |
711 | *d01 = *d12 + v1; |
712 | *d12 = *d23 + v0; |
713 | *d23 = *d34 + v1; |
714 | *d34 = v2 + kHalf; |
715 | |
716 | return answer; |
717 | } |
718 | |
719 | static Sk8h blur_y_radius_3( |
720 | const Sk8h& s0, |
721 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&, |
722 | Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) { |
723 | auto v0 = s0.mulHi(g0); |
724 | auto v1 = s0.mulHi(g1); |
725 | auto v2 = s0.mulHi(g2); |
726 | auto v3 = s0.mulHi(g3); |
727 | |
728 | Sk8h answer = *d01 + v3; |
729 | *d01 = *d12 + v2; |
730 | *d12 = *d23 + v1; |
731 | *d23 = *d34 + v0; |
732 | *d34 = *d45 + v1; |
733 | *d45 = *d56 + v2; |
734 | *d56 = v3 + kHalf; |
735 | |
736 | return answer; |
737 | } |
738 | |
739 | static Sk8h blur_y_radius_4( |
740 | const Sk8h& s0, |
741 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, |
742 | Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) { |
743 | auto v0 = s0.mulHi(g0); |
744 | auto v1 = s0.mulHi(g1); |
745 | auto v2 = s0.mulHi(g2); |
746 | auto v3 = s0.mulHi(g3); |
747 | auto v4 = s0.mulHi(g4); |
748 | |
749 | Sk8h answer = *d01 + v4; |
750 | *d01 = *d12 + v3; |
751 | *d12 = *d23 + v2; |
752 | *d23 = *d34 + v1; |
753 | *d34 = *d45 + v0; |
754 | *d45 = *d56 + v1; |
755 | *d56 = *d67 + v2; |
756 | *d67 = *d78 + v3; |
757 | *d78 = v4 + kHalf; |
758 | |
759 | return answer; |
760 | } |
761 | |
762 | using BlurY = decltype(blur_y_radius_1); |
763 | |
764 | // BlurY will be one of blur_y_radius_(1|2|3|4). |
765 | static void blur_column( |
766 | ToA8 toA8, |
767 | BlurY blur, int radius, int width, |
768 | const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, |
769 | const uint8_t* src, size_t srcRB, int srcH, |
770 | uint8_t* dst, size_t dstRB) { |
771 | Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf}, |
772 | d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf}; |
773 | |
774 | auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) { |
775 | store(to, v0, width); |
776 | to += dstRB; |
777 | store(to, v1, width); |
778 | return to + dstRB; |
779 | }; |
780 | |
781 | for (int y = 0; y < srcH; y += 1) { |
782 | auto s = load(src, width, toA8); |
783 | auto b = blur(s, |
784 | g0, g1, g2, g3, g4, |
785 | &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78); |
786 | store(dst, b, width); |
787 | src += srcRB; |
788 | dst += dstRB; |
789 | } |
790 | |
791 | if (radius >= 1) { |
792 | dst = flush(dst, d01, d12); |
793 | } |
794 | if (radius >= 2) { |
795 | dst = flush(dst, d23, d34); |
796 | } |
797 | if (radius >= 3) { |
798 | dst = flush(dst, d45, d56); |
799 | } |
800 | if (radius >= 4) { |
801 | flush(dst, d67, d78); |
802 | } |
803 | } |
804 | |
805 | // BlurY will be one of blur_y_radius_(1|2|3|4). |
806 | static void blur_y_rect(ToA8 toA8, const int strideOf8, |
807 | BlurY blur, int radius, uint16_t *gauss, |
808 | const uint8_t *src, size_t srcRB, int srcW, int srcH, |
809 | uint8_t *dst, size_t dstRB) { |
810 | |
811 | Sk8h g0{gauss[0]}, |
812 | g1{gauss[1]}, |
813 | g2{gauss[2]}, |
814 | g3{gauss[3]}, |
815 | g4{gauss[4]}; |
816 | |
817 | int x = 0; |
818 | for (; x <= srcW - 8; x += 8) { |
819 | blur_column(toA8, blur, radius, 8, |
820 | g0, g1, g2, g3, g4, |
821 | src, srcRB, srcH, |
822 | dst, dstRB); |
823 | src += strideOf8; |
824 | dst += 8; |
825 | } |
826 | |
827 | int xTail = srcW - x; |
828 | if (xTail > 0) { |
829 | blur_column(toA8, blur, radius, xTail, |
830 | g0, g1, g2, g3, g4, |
831 | src, srcRB, srcH, |
832 | dst, dstRB); |
833 | } |
834 | } |
835 | |
836 | static void direct_blur_y(ToA8 toA8, const int strideOf8, |
837 | int radius, uint16_t* gauss, |
838 | const uint8_t* src, size_t srcRB, int srcW, int srcH, |
839 | uint8_t* dst, size_t dstRB) { |
840 | |
841 | switch (radius) { |
842 | case 1: |
843 | blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss, |
844 | src, srcRB, srcW, srcH, |
845 | dst, dstRB); |
846 | break; |
847 | |
848 | case 2: |
849 | blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss, |
850 | src, srcRB, srcW, srcH, |
851 | dst, dstRB); |
852 | break; |
853 | |
854 | case 3: |
855 | blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss, |
856 | src, srcRB, srcW, srcH, |
857 | dst, dstRB); |
858 | break; |
859 | |
860 | case 4: |
861 | blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss, |
862 | src, srcRB, srcW, srcH, |
863 | dst, dstRB); |
864 | break; |
865 | |
866 | default: |
867 | SkASSERTF(false, "The radius %d is not handled\n" , radius); |
868 | } |
869 | } |
870 | |
871 | static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) { |
872 | SkASSERT(sigmaX == sigmaY); // TODO |
873 | SkASSERT(0.01 <= sigmaX && sigmaX < 2); |
874 | SkASSERT(0.01 <= sigmaY && sigmaY < 2); |
875 | |
876 | SkGaussFilter filterX{sigmaX}, |
877 | filterY{sigmaY}; |
878 | |
879 | int radiusX = filterX.radius(), |
880 | radiusY = filterY.radius(); |
881 | |
882 | SkASSERT(radiusX <= 4 && radiusY <= 4); |
883 | |
884 | auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) { |
885 | int i = 0; |
886 | for (double d : filter) { |
887 | factors[i++] = static_cast<uint16_t>(round(d * (1 << 16))); |
888 | } |
889 | }; |
890 | |
891 | uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax], |
892 | gaussFactorsY[SkGaussFilter::kGaussArrayMax]; |
893 | |
894 | prepareGauss(filterX, gaussFactorsX); |
895 | prepareGauss(filterY, gaussFactorsY); |
896 | |
897 | *dst = SkMask::PrepareDestination(radiusX, radiusY, src); |
898 | if (src.fImage == nullptr) { |
899 | return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)}; |
900 | } |
901 | if (dst->fImage == nullptr) { |
902 | dst->fBounds.setEmpty(); |
903 | return {0, 0}; |
904 | } |
905 | |
906 | int srcW = src.fBounds.width(), |
907 | srcH = src.fBounds.height(); |
908 | |
909 | int dstW = dst->fBounds.width(), |
910 | dstH = dst->fBounds.height(); |
911 | |
912 | size_t srcRB = src.fRowBytes, |
913 | dstRB = dst->fRowBytes; |
914 | |
915 | //TODO: handle bluring in only one direction. |
916 | |
917 | // Blur vertically and copy to destination. |
918 | switch (src.fFormat) { |
919 | case SkMask::kBW_Format: |
920 | direct_blur_y(bw_to_a8, 1, |
921 | radiusY, gaussFactorsY, |
922 | src.fImage, srcRB, srcW, srcH, |
923 | dst->fImage + radiusX, dstRB); |
924 | break; |
925 | case SkMask::kA8_Format: |
926 | direct_blur_y(nullptr, 8, |
927 | radiusY, gaussFactorsY, |
928 | src.fImage, srcRB, srcW, srcH, |
929 | dst->fImage + radiusX, dstRB); |
930 | break; |
931 | case SkMask::kARGB32_Format: |
932 | direct_blur_y(argb32_to_a8, 32, |
933 | radiusY, gaussFactorsY, |
934 | src.fImage, srcRB, srcW, srcH, |
935 | dst->fImage + radiusX, dstRB); |
936 | break; |
937 | case SkMask::kLCD16_Format: |
938 | direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY, |
939 | src.fImage, srcRB, srcW, srcH, |
940 | dst->fImage + radiusX, dstRB); |
941 | break; |
942 | default: |
943 | SK_ABORT("Unhandled format." ); |
944 | } |
945 | |
946 | // Blur horizontally in place. |
947 | direct_blur_x(radiusX, gaussFactorsX, |
948 | dst->fImage + radiusX, dstRB, srcW, |
949 | dst->fImage, dstRB, dstW, dstH); |
950 | |
951 | return {radiusX, radiusY}; |
952 | } |
953 | |
954 | // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the |
955 | // API forces the sigmas to be the same. |
956 | SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const { |
957 | |
958 | if (fSigmaW < 2.0 && fSigmaH < 2.0) { |
959 | return small_blur(fSigmaW, fSigmaH, src, dst); |
960 | } |
961 | |
962 | // 1024 is a place holder guess until more analysis can be done. |
963 | SkSTArenaAlloc<1024> alloc; |
964 | |
965 | PlanGauss planW(fSigmaW); |
966 | PlanGauss planH(fSigmaH); |
967 | |
968 | int borderW = planW.border(), |
969 | borderH = planH.border(); |
970 | SkASSERT(borderH >= 0 && borderW >= 0); |
971 | |
972 | *dst = SkMask::PrepareDestination(borderW, borderH, src); |
973 | if (src.fImage == nullptr) { |
974 | return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)}; |
975 | } |
976 | if (dst->fImage == nullptr) { |
977 | dst->fBounds.setEmpty(); |
978 | return {0, 0}; |
979 | } |
980 | |
981 | int srcW = src.fBounds.width(), |
982 | srcH = src.fBounds.height(), |
983 | dstW = dst->fBounds.width(), |
984 | dstH = dst->fBounds.height(); |
985 | SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0); |
986 | |
987 | auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize()); |
988 | auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize); |
989 | |
990 | // Blur both directions. |
991 | int tmpW = srcH, |
992 | tmpH = dstW; |
993 | |
994 | auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH); |
995 | |
996 | // Blur horizontally, and transpose. |
997 | const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer); |
998 | switch (src.fFormat) { |
999 | case SkMask::kBW_Format: { |
1000 | const uint8_t* bwStart = src.fImage; |
1001 | auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0); |
1002 | auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8); |
1003 | for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) { |
1004 | auto tmpStart = &tmp[y]; |
1005 | scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH); |
1006 | } |
1007 | } break; |
1008 | case SkMask::kA8_Format: { |
1009 | const uint8_t* a8Start = src.fImage; |
1010 | auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start); |
1011 | auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW); |
1012 | for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) { |
1013 | auto tmpStart = &tmp[y]; |
1014 | scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH); |
1015 | } |
1016 | } break; |
1017 | case SkMask::kARGB32_Format: { |
1018 | const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage); |
1019 | auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart); |
1020 | auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW); |
1021 | for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) { |
1022 | auto tmpStart = &tmp[y]; |
1023 | scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH); |
1024 | } |
1025 | } break; |
1026 | case SkMask::kLCD16_Format: { |
1027 | const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage); |
1028 | auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart); |
1029 | auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW); |
1030 | for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) { |
1031 | auto tmpStart = &tmp[y]; |
1032 | scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH); |
1033 | } |
1034 | } break; |
1035 | default: |
1036 | SK_ABORT("Unhandled format." ); |
1037 | } |
1038 | |
1039 | // Blur vertically (scan in memory order because of the transposition), |
1040 | // and transpose back to the original orientation. |
1041 | const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer); |
1042 | for (int y = 0; y < tmpH; y++) { |
1043 | auto tmpStart = &tmp[y * tmpW]; |
1044 | auto dstStart = &dst->fImage[y]; |
1045 | |
1046 | scanH.blur(tmpStart, tmpStart + tmpW, |
1047 | dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH); |
1048 | } |
1049 | |
1050 | return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)}; |
1051 | } |
1052 | |