1/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "src/core/SkMaskBlurFilter.h"
9
10#include "include/core/SkColorPriv.h"
11#include "include/private/SkMalloc.h"
12#include "include/private/SkNx.h"
13#include "include/private/SkTemplates.h"
14#include "include/private/SkTo.h"
15#include "src/core/SkArenaAlloc.h"
16#include "src/core/SkGaussFilter.h"
17
18#include <cmath>
19#include <climits>
20
21namespace {
22static const double kPi = 3.14159265358979323846264338327950288;
23
24class PlanGauss final {
25public:
26 explicit PlanGauss(double sigma) {
27 auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
28 auto window = std::max(1, possibleWindow);
29
30 fPass0Size = window - 1;
31 fPass1Size = window - 1;
32 fPass2Size = (window & 1) == 1 ? window - 1 : window;
33
34 // Calculating the border is tricky. I will go through the odd case which is simpler, and
35 // then through the even case. Given a stack of filters seven wide for the odd case of
36 // three passes.
37 //
38 // S
39 // aaaAaaa
40 // bbbBbbb
41 // cccCccc
42 // D
43 //
44 // The furthest changed pixel is when the filters are in the following configuration.
45 //
46 // S
47 // aaaAaaa
48 // bbbBbbb
49 // cccCccc
50 // D
51 //
52 // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
53 // finally D is C. So, with a window size of seven the border is nine. In general, the
54 // border is 3*((window - 1)/2).
55 //
56 // For even cases the filter stack is more complicated. The spec specifies two passes
57 // of even filters and a final pass of odd filters. A stack for a width of six looks like
58 // this.
59 //
60 // S
61 // aaaAaa
62 // bbBbbb
63 // cccCccc
64 // D
65 //
66 // The furthest pixel looks like this.
67 //
68 // S
69 // aaaAaa
70 // bbBbbb
71 // cccCccc
72 // D
73 //
74 // For a window of size, the border value is seven. In general the border is 3 *
75 // (window/2) -1.
76 fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
77 fSlidingWindow = 2 * fBorder + 1;
78
79 // If the window is odd then the divisor is just window ^ 3 otherwise,
80 // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
81 auto window2 = window * window;
82 auto window3 = window2 * window;
83 auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
84
85 fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
86 }
87
88 size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }
89
90 int border() const { return fBorder; }
91
92public:
93 class Scan {
94 public:
95 Scan(uint64_t weight, int noChangeCount,
96 uint32_t* buffer0, uint32_t* buffer0End,
97 uint32_t* buffer1, uint32_t* buffer1End,
98 uint32_t* buffer2, uint32_t* buffer2End)
99 : fWeight{weight}
100 , fNoChangeCount{noChangeCount}
101 , fBuffer0{buffer0}
102 , fBuffer0End{buffer0End}
103 , fBuffer1{buffer1}
104 , fBuffer1End{buffer1End}
105 , fBuffer2{buffer2}
106 , fBuffer2End{buffer2End}
107 { }
108
109 template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
110 uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
111 auto buffer0Cursor = fBuffer0;
112 auto buffer1Cursor = fBuffer1;
113 auto buffer2Cursor = fBuffer2;
114
115 std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
116
117 uint32_t sum0 = 0;
118 uint32_t sum1 = 0;
119 uint32_t sum2 = 0;
120
121 // Consume the source generating pixels.
122 for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
123 uint32_t leadingEdge = *src;
124 sum0 += leadingEdge;
125 sum1 += sum0;
126 sum2 += sum1;
127
128 *dst = this->finalScale(sum2);
129
130 sum2 -= *buffer2Cursor;
131 *buffer2Cursor = sum1;
132 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
133
134 sum1 -= *buffer1Cursor;
135 *buffer1Cursor = sum0;
136 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
137
138 sum0 -= *buffer0Cursor;
139 *buffer0Cursor = leadingEdge;
140 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
141 }
142
143 // The leading edge is off the right side of the mask.
144 for (int i = 0; i < fNoChangeCount; i++) {
145 uint32_t leadingEdge = 0;
146 sum0 += leadingEdge;
147 sum1 += sum0;
148 sum2 += sum1;
149
150 *dst = this->finalScale(sum2);
151
152 sum2 -= *buffer2Cursor;
153 *buffer2Cursor = sum1;
154 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
155
156 sum1 -= *buffer1Cursor;
157 *buffer1Cursor = sum0;
158 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
159
160 sum0 -= *buffer0Cursor;
161 *buffer0Cursor = leadingEdge;
162 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
163
164 dst += dstStride;
165 }
166
167 // Starting from the right, fill in the rest of the buffer.
168 std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
169
170 sum0 = sum1 = sum2 = 0;
171
172 uint8_t* dstCursor = dstEnd;
173 AlphaIter src = srcEnd;
174 while (dstCursor > dst) {
175 dstCursor -= dstStride;
176 uint32_t leadingEdge = *(--src);
177 sum0 += leadingEdge;
178 sum1 += sum0;
179 sum2 += sum1;
180
181 *dstCursor = this->finalScale(sum2);
182
183 sum2 -= *buffer2Cursor;
184 *buffer2Cursor = sum1;
185 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
186
187 sum1 -= *buffer1Cursor;
188 *buffer1Cursor = sum0;
189 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
190
191 sum0 -= *buffer0Cursor;
192 *buffer0Cursor = leadingEdge;
193 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
194 }
195 }
196
197 private:
198 static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
199
200 uint8_t finalScale(uint32_t sum) const {
201 return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
202 }
203
204 uint64_t fWeight;
205 int fNoChangeCount;
206 uint32_t* fBuffer0;
207 uint32_t* fBuffer0End;
208 uint32_t* fBuffer1;
209 uint32_t* fBuffer1End;
210 uint32_t* fBuffer2;
211 uint32_t* fBuffer2End;
212 };
213
214 Scan makeBlurScan(int width, uint32_t* buffer) const {
215 uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
216 buffer0 = buffer;
217 buffer0End = buffer1 = buffer0 + fPass0Size;
218 buffer1End = buffer2 = buffer1 + fPass1Size;
219 buffer2End = buffer2 + fPass2Size;
220 int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
221
222 return Scan(
223 fWeight, noChangeCount,
224 buffer0, buffer0End,
225 buffer1, buffer1End,
226 buffer2, buffer2End);
227 }
228
229 uint64_t fWeight;
230 int fBorder;
231 int fSlidingWindow;
232 int fPass0Size;
233 int fPass1Size;
234 int fPass2Size;
235};
236
237} // namespace
238
239// NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
240// using the Gauss filter. It also limits the size of buffers used hold intermediate values. The
241// additional + 1 added to window represents adding one more leading element before subtracting the
242// trailing element.
243// Explanation of maximums:
244// sum0 = (window + 1) * 255
245// sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255
246// sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255
247//
248// The value (window + 1)^3 * 255 must fit in a uint32_t. So,
249// (window + 1)^3 * 255 < 2^32. window = 255.
250//
251// window = floor(sigma * 3 * sqrt(2 * kPi) / 4)
252// For window <= 255, the largest value for sigma is 135.
253SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
254 : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)}
255 , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)}
256{
257 SkASSERT(sigmaW >= 0);
258 SkASSERT(sigmaH >= 0);
259}
260
261bool SkMaskBlurFilter::hasNoBlur() const {
262 return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
263}
264
265// We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
266// Each of these converts width (up to 8) mask values to A8.
267static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) {
268 SkASSERT(0 < width && width <= 8);
269
270 uint8_t masks = *from;
271 for (int i = 0; i < width; ++i) {
272 a8[i] = (masks >> (7 - i)) & 1 ? 0xFF
273 : 0x00;
274 }
275}
276static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) {
277 SkASSERT(0 < width && width <= 8);
278
279 for (int i = 0; i < width; ++i) {
280 unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i],
281 r = SkPacked16ToR32(rgb),
282 g = SkPacked16ToG32(rgb),
283 b = SkPacked16ToB32(rgb);
284 a8[i] = (r + g + b) / 3;
285 }
286}
287static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
288 SkASSERT(0 < width && width <= 8);
289 for (int i = 0; i < width; ++i) {
290 uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i];
291 a8[i] = SkGetPackedA32(rgba);
292 }
293}
294using ToA8 = decltype(bw_to_a8);
295
296static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
297 // Our fast path is a full 8-byte load of A8.
298 // So we'll conditionally handle the two slow paths using tmp:
299 // - if we have a function to convert another mask to A8, use it;
300 // - if not but we have less than 8 bytes to load, load them one at a time.
301 uint8_t tmp[8] = {0,0,0,0, 0,0,0,0};
302 if (toA8) {
303 toA8(tmp, from, width);
304 from = tmp;
305 } else if (width < 8) {
306 for (int i = 0; i < width; ++i) {
307 tmp[i] = from[i];
308 }
309 from = tmp;
310 }
311
312 // Load A8 and convert to 8.8 fixed-point.
313 return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
314}
315
316static void store(uint8_t* to, const Sk8h& v, int width) {
317 Sk8b b = SkNx_cast<uint8_t>(v >> 8);
318 if (width == 8) {
319 b.store(to);
320 } else {
321 uint8_t buffer[8];
322 b.store(buffer);
323 for (int i = 0; i < width; i++) {
324 to[i] = buffer[i];
325 }
326 }
327};
328
329static constexpr uint16_t _____ = 0u;
330static constexpr uint16_t kHalf = 0x80u;
331
332// In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
333// in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
334// format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
335// resulting in a 8.8 format.
336//
337// The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
338// system is setup to minimize the number of multiplies needed.
339//
340// Explanation:
341// Blurring a specific mask value is given by the following equation where D_n is the resulting
342// mask value and S_n is the source value. The example below is for a filter with a radius of 1
343// and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
344// aligned. The filter is given by G_n where n is the symmetric filter value.
345//
346// D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
347//
348// We can start the source index at an offset relative to the destination separated by the
349// radius. This results in a non-traditional restating of the above filter.
350//
351// D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
352//
353// If we look at three specific consecutive destinations the following equations result:
354//
355// D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
356// D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
357// D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
358//
359// In the above equations, notice that S[7] is used in all three. In particular, two values are
360// used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
361// D[7].
362//
363// From the point of view of a source value we end up with the following three equations.
364//
365// Given S[7]:
366// D[5] += S[7]*G[1]
367// D[6] += S[7]*G[0]
368// D[7] += S[7]*G[1]
369//
370// In General:
371// D[n] += S[n]*G[1]
372// D[n+1] += S[n]*G[0]
373// D[n+2] += S[n]*G[1]
374//
375// Now these equations can be ganged using SIMD to form:
376// D[n..n+7] += S[n..n+7]*G[1]
377// D[n+1..n+8] += S[n..n+7]*G[0]
378// D[n+2..n+9] += S[n..n+7]*G[1]
379// The next set of values becomes.
380// D[n+8..n+15] += S[n+8..n+15]*G[1]
381// D[n+9..n+16] += S[n+8..n+15]*G[0]
382// D[n+10..n+17] += S[n+8..n+15]*G[1]
383// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
384// S[n..7] and S[n+8..n+15].
385//
386// Just one more transformation allows the code to maintain all working values in
387// registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
388// prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
389//
390// D[n..n+7] += S[n..n+7] * G[1]
391// D[n..n+8] += {0, S[n..n+7] * G[0]}
392// D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
393//
394// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
395// register d8. In addition, S[0..n+7] becomes s0.
396//
397// The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
398//
399// Sk8h v0 = s0*G[0]
400// Sk8h v1 = s0*G[1]
401// /* D[n..n+7] += S[n..n+7] * G[1] */
402// d0 += v1;
403// /* D[n..n+8] += {0, S[n..n+7] * G[0]} */
404// d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
405// d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
406// /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */
407// d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
408// d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
409// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
410
411static void blur_x_radius_1(
412 const Sk8h& s0,
413 const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
414 Sk8h* d0, Sk8h* d8) {
415
416 auto v1 = s0.mulHi(g1);
417 auto v0 = s0.mulHi(g0);
418
419 // D[n..n+7] += S[n..n+7] * G[1]
420 *d0 += v1;
421
422 //D[n..n+8] += {0, S[n..n+7] * G[0]}
423 *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
424 *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
425
426 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
427 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
428 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
429
430}
431
432static void blur_x_radius_2(
433 const Sk8h& s0,
434 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
435 Sk8h* d0, Sk8h* d8) {
436 auto v0 = s0.mulHi(g0);
437 auto v1 = s0.mulHi(g1);
438 auto v2 = s0.mulHi(g2);
439
440 // D[n..n+7] += S[n..n+7] * G[2]
441 *d0 += v2;
442
443 // D[n..n+8] += {0, S[n..n+7] * G[1]}
444 *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
445 *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
446
447 // D[n..n+9] += {0, 0, S[n..n+7] * G[0]}
448 *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
449 *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
450
451 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
452 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
453 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
454
455 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]}
456 *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
457 *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
458}
459
460static void blur_x_radius_3(
461 const Sk8h& s0,
462 const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
463 Sk8h* d0, Sk8h* d8) {
464 auto v0 = s0.mulHi(gauss0);
465 auto v1 = s0.mulHi(gauss1);
466 auto v2 = s0.mulHi(gauss2);
467 auto v3 = s0.mulHi(gauss3);
468
469 // D[n..n+7] += S[n..n+7] * G[3]
470 *d0 += v3;
471
472 // D[n..n+8] += {0, S[n..n+7] * G[2]}
473 *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
474 *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
475
476 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
477 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
478 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
479
480 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]}
481 *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
482 *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
483
484 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]}
485 *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
486 *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
487
488 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
489 *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
490 *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
491
492 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
493 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
494 *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
495}
496
497static void blur_x_radius_4(
498 const Sk8h& s0,
499 const Sk8h& gauss0,
500 const Sk8h& gauss1,
501 const Sk8h& gauss2,
502 const Sk8h& gauss3,
503 const Sk8h& gauss4,
504 Sk8h* d0, Sk8h* d8) {
505 auto v0 = s0.mulHi(gauss0);
506 auto v1 = s0.mulHi(gauss1);
507 auto v2 = s0.mulHi(gauss2);
508 auto v3 = s0.mulHi(gauss3);
509 auto v4 = s0.mulHi(gauss4);
510
511 // D[n..n+7] += S[n..n+7] * G[4]
512 *d0 += v4;
513
514 // D[n..n+8] += {0, S[n..n+7] * G[3]}
515 *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
516 *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
517
518 // D[n..n+9] += {0, 0, S[n..n+7] * G[2]}
519 *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
520 *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
521
522 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
523 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
524 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
525
526 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]}
527 *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
528 *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
529
530 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
531 *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
532 *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
533
534 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
535 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
536 *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
537
538 // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
539 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
540 *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
541
542 // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
543 *d8 += v4;
544}
545
546using BlurX = decltype(blur_x_radius_1);
547
548// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
549static void blur_row(
550 BlurX blur,
551 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
552 const uint8_t* src, int srcW,
553 uint8_t* dst, int dstW) {
554 // Clear the buffer to handle summing wider than source.
555 Sk8h d0{kHalf}, d8{kHalf};
556
557 // Go by multiples of 8 in src.
558 int x = 0;
559 for (; x <= srcW - 8; x += 8) {
560 blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
561
562 store(dst, d0, 8);
563
564 d0 = d8;
565 d8 = Sk8h{kHalf};
566
567 src += 8;
568 dst += 8;
569 }
570
571 // There are src values left, but the remainder of src values is not a multiple of 8.
572 int srcTail = srcW - x;
573 if (srcTail > 0) {
574
575 blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
576
577 int dstTail = std::min(8, dstW - x);
578 store(dst, d0, dstTail);
579
580 d0 = d8;
581 dst += dstTail;
582 x += dstTail;
583 }
584
585 // There are dst mask values to complete.
586 int dstTail = dstW - x;
587 if (dstTail > 0) {
588 store(dst, d0, dstTail);
589 }
590}
591
592// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
593static void blur_x_rect(BlurX blur,
594 uint16_t* gauss,
595 const uint8_t* src, size_t srcStride, int srcW,
596 uint8_t* dst, size_t dstStride, int dstW, int dstH) {
597
598 Sk8h g0{gauss[0]},
599 g1{gauss[1]},
600 g2{gauss[2]},
601 g3{gauss[3]},
602 g4{gauss[4]};
603
604 // Blur *ALL* the rows.
605 for (int y = 0; y < dstH; y++) {
606 blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
607 src += srcStride;
608 dst += dstStride;
609 }
610}
611
612static void direct_blur_x(int radius, uint16_t* gauss,
613 const uint8_t* src, size_t srcStride, int srcW,
614 uint8_t* dst, size_t dstStride, int dstW, int dstH) {
615
616 switch (radius) {
617 case 1:
618 blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
619 break;
620
621 case 2:
622 blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
623 break;
624
625 case 3:
626 blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
627 break;
628
629 case 4:
630 blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
631 break;
632
633 default:
634 SkASSERTF(false, "The radius %d is not handled\n", radius);
635 }
636}
637
638// The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
639// functions, but end up being simpler because there is no complicated shift of registers. We
640// start with the non-traditional form of the gaussian filter. In the following r is the value
641// when added generates the next value in the column.
642//
643// D[n+0r] = S[n+0r]*G[1]
644// + S[n+1r]*G[0]
645// + S[n+2r]*G[1]
646//
647// Expanding out in a way similar to blur_x_radius_N for specific values of n.
648//
649// D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
650// D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
651// D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
652//
653// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
654// can look at the calculation form the point of view of a source value.
655//
656// Given S[n+0r]:
657// D[n+0r] += S[n+0r]*G[1];
658// /* D[n+0r] is done and can be stored now. */
659// D[n+1r] += S[n+0r]*G[0];
660// D[n+2r] = S[n+0r]*G[1];
661//
662// Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
663// S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
664// notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
665// how values flow in the following two iterations in source.
666//
667// D[n+0r] += S[n+0r]*G[1]
668// D[n+1r] += S[n+0r]*G[0]
669// D[n+2r] = S[n+0r]*G[1]
670// /* ------- */
671// D[n+1r] += S[n+1r]*G[1]
672// D[n+2r] += S[n+1r]*G[0]
673// D[n+3r] = S[n+1r]*G[1]
674//
675// Instead of using memory we can introduce temporaries d01 and d12. The update step changes
676// to the following.
677//
678// answer = d01 + S[n+0r]*G[1]
679// d01 = d12 + S[n+0r]*G[0]
680// d12 = S[n+0r]*G[1]
681// return answer
682//
683// Finally, this can be ganged into SIMD style.
684// answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
685// d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0]
686// d12[0..7] = S[n+0r..n+0r+7]*G[1]
687// return answer[0..7]
688static Sk8h blur_y_radius_1(
689 const Sk8h& s0,
690 const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
691 Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
692 auto v0 = s0.mulHi(g0);
693 auto v1 = s0.mulHi(g1);
694
695 Sk8h answer = *d01 + v1;
696 *d01 = *d12 + v0;
697 *d12 = v1 + kHalf;
698
699 return answer;
700}
701
702static Sk8h blur_y_radius_2(
703 const Sk8h& s0,
704 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
705 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
706 auto v0 = s0.mulHi(g0);
707 auto v1 = s0.mulHi(g1);
708 auto v2 = s0.mulHi(g2);
709
710 Sk8h answer = *d01 + v2;
711 *d01 = *d12 + v1;
712 *d12 = *d23 + v0;
713 *d23 = *d34 + v1;
714 *d34 = v2 + kHalf;
715
716 return answer;
717}
718
719static Sk8h blur_y_radius_3(
720 const Sk8h& s0,
721 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
722 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
723 auto v0 = s0.mulHi(g0);
724 auto v1 = s0.mulHi(g1);
725 auto v2 = s0.mulHi(g2);
726 auto v3 = s0.mulHi(g3);
727
728 Sk8h answer = *d01 + v3;
729 *d01 = *d12 + v2;
730 *d12 = *d23 + v1;
731 *d23 = *d34 + v0;
732 *d34 = *d45 + v1;
733 *d45 = *d56 + v2;
734 *d56 = v3 + kHalf;
735
736 return answer;
737}
738
739static Sk8h blur_y_radius_4(
740 const Sk8h& s0,
741 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
742 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
743 auto v0 = s0.mulHi(g0);
744 auto v1 = s0.mulHi(g1);
745 auto v2 = s0.mulHi(g2);
746 auto v3 = s0.mulHi(g3);
747 auto v4 = s0.mulHi(g4);
748
749 Sk8h answer = *d01 + v4;
750 *d01 = *d12 + v3;
751 *d12 = *d23 + v2;
752 *d23 = *d34 + v1;
753 *d34 = *d45 + v0;
754 *d45 = *d56 + v1;
755 *d56 = *d67 + v2;
756 *d67 = *d78 + v3;
757 *d78 = v4 + kHalf;
758
759 return answer;
760}
761
762using BlurY = decltype(blur_y_radius_1);
763
764// BlurY will be one of blur_y_radius_(1|2|3|4).
765static void blur_column(
766 ToA8 toA8,
767 BlurY blur, int radius, int width,
768 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
769 const uint8_t* src, size_t srcRB, int srcH,
770 uint8_t* dst, size_t dstRB) {
771 Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
772 d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
773
774 auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
775 store(to, v0, width);
776 to += dstRB;
777 store(to, v1, width);
778 return to + dstRB;
779 };
780
781 for (int y = 0; y < srcH; y += 1) {
782 auto s = load(src, width, toA8);
783 auto b = blur(s,
784 g0, g1, g2, g3, g4,
785 &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
786 store(dst, b, width);
787 src += srcRB;
788 dst += dstRB;
789 }
790
791 if (radius >= 1) {
792 dst = flush(dst, d01, d12);
793 }
794 if (radius >= 2) {
795 dst = flush(dst, d23, d34);
796 }
797 if (radius >= 3) {
798 dst = flush(dst, d45, d56);
799 }
800 if (radius >= 4) {
801 flush(dst, d67, d78);
802 }
803}
804
805// BlurY will be one of blur_y_radius_(1|2|3|4).
806static void blur_y_rect(ToA8 toA8, const int strideOf8,
807 BlurY blur, int radius, uint16_t *gauss,
808 const uint8_t *src, size_t srcRB, int srcW, int srcH,
809 uint8_t *dst, size_t dstRB) {
810
811 Sk8h g0{gauss[0]},
812 g1{gauss[1]},
813 g2{gauss[2]},
814 g3{gauss[3]},
815 g4{gauss[4]};
816
817 int x = 0;
818 for (; x <= srcW - 8; x += 8) {
819 blur_column(toA8, blur, radius, 8,
820 g0, g1, g2, g3, g4,
821 src, srcRB, srcH,
822 dst, dstRB);
823 src += strideOf8;
824 dst += 8;
825 }
826
827 int xTail = srcW - x;
828 if (xTail > 0) {
829 blur_column(toA8, blur, radius, xTail,
830 g0, g1, g2, g3, g4,
831 src, srcRB, srcH,
832 dst, dstRB);
833 }
834}
835
836static void direct_blur_y(ToA8 toA8, const int strideOf8,
837 int radius, uint16_t* gauss,
838 const uint8_t* src, size_t srcRB, int srcW, int srcH,
839 uint8_t* dst, size_t dstRB) {
840
841 switch (radius) {
842 case 1:
843 blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss,
844 src, srcRB, srcW, srcH,
845 dst, dstRB);
846 break;
847
848 case 2:
849 blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss,
850 src, srcRB, srcW, srcH,
851 dst, dstRB);
852 break;
853
854 case 3:
855 blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss,
856 src, srcRB, srcW, srcH,
857 dst, dstRB);
858 break;
859
860 case 4:
861 blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss,
862 src, srcRB, srcW, srcH,
863 dst, dstRB);
864 break;
865
866 default:
867 SkASSERTF(false, "The radius %d is not handled\n", radius);
868 }
869}
870
871static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
872 SkASSERT(sigmaX == sigmaY); // TODO
873 SkASSERT(0.01 <= sigmaX && sigmaX < 2);
874 SkASSERT(0.01 <= sigmaY && sigmaY < 2);
875
876 SkGaussFilter filterX{sigmaX},
877 filterY{sigmaY};
878
879 int radiusX = filterX.radius(),
880 radiusY = filterY.radius();
881
882 SkASSERT(radiusX <= 4 && radiusY <= 4);
883
884 auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
885 int i = 0;
886 for (double d : filter) {
887 factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
888 }
889 };
890
891 uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
892 gaussFactorsY[SkGaussFilter::kGaussArrayMax];
893
894 prepareGauss(filterX, gaussFactorsX);
895 prepareGauss(filterY, gaussFactorsY);
896
897 *dst = SkMask::PrepareDestination(radiusX, radiusY, src);
898 if (src.fImage == nullptr) {
899 return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
900 }
901 if (dst->fImage == nullptr) {
902 dst->fBounds.setEmpty();
903 return {0, 0};
904 }
905
906 int srcW = src.fBounds.width(),
907 srcH = src.fBounds.height();
908
909 int dstW = dst->fBounds.width(),
910 dstH = dst->fBounds.height();
911
912 size_t srcRB = src.fRowBytes,
913 dstRB = dst->fRowBytes;
914
915 //TODO: handle bluring in only one direction.
916
917 // Blur vertically and copy to destination.
918 switch (src.fFormat) {
919 case SkMask::kBW_Format:
920 direct_blur_y(bw_to_a8, 1,
921 radiusY, gaussFactorsY,
922 src.fImage, srcRB, srcW, srcH,
923 dst->fImage + radiusX, dstRB);
924 break;
925 case SkMask::kA8_Format:
926 direct_blur_y(nullptr, 8,
927 radiusY, gaussFactorsY,
928 src.fImage, srcRB, srcW, srcH,
929 dst->fImage + radiusX, dstRB);
930 break;
931 case SkMask::kARGB32_Format:
932 direct_blur_y(argb32_to_a8, 32,
933 radiusY, gaussFactorsY,
934 src.fImage, srcRB, srcW, srcH,
935 dst->fImage + radiusX, dstRB);
936 break;
937 case SkMask::kLCD16_Format:
938 direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY,
939 src.fImage, srcRB, srcW, srcH,
940 dst->fImage + radiusX, dstRB);
941 break;
942 default:
943 SK_ABORT("Unhandled format.");
944 }
945
946 // Blur horizontally in place.
947 direct_blur_x(radiusX, gaussFactorsX,
948 dst->fImage + radiusX, dstRB, srcW,
949 dst->fImage, dstRB, dstW, dstH);
950
951 return {radiusX, radiusY};
952}
953
954// TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
955// API forces the sigmas to be the same.
956SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
957
958 if (fSigmaW < 2.0 && fSigmaH < 2.0) {
959 return small_blur(fSigmaW, fSigmaH, src, dst);
960 }
961
962 // 1024 is a place holder guess until more analysis can be done.
963 SkSTArenaAlloc<1024> alloc;
964
965 PlanGauss planW(fSigmaW);
966 PlanGauss planH(fSigmaH);
967
968 int borderW = planW.border(),
969 borderH = planH.border();
970 SkASSERT(borderH >= 0 && borderW >= 0);
971
972 *dst = SkMask::PrepareDestination(borderW, borderH, src);
973 if (src.fImage == nullptr) {
974 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
975 }
976 if (dst->fImage == nullptr) {
977 dst->fBounds.setEmpty();
978 return {0, 0};
979 }
980
981 int srcW = src.fBounds.width(),
982 srcH = src.fBounds.height(),
983 dstW = dst->fBounds.width(),
984 dstH = dst->fBounds.height();
985 SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
986
987 auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
988 auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
989
990 // Blur both directions.
991 int tmpW = srcH,
992 tmpH = dstW;
993
994 auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
995
996 // Blur horizontally, and transpose.
997 const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
998 switch (src.fFormat) {
999 case SkMask::kBW_Format: {
1000 const uint8_t* bwStart = src.fImage;
1001 auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
1002 auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
1003 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1004 auto tmpStart = &tmp[y];
1005 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1006 }
1007 } break;
1008 case SkMask::kA8_Format: {
1009 const uint8_t* a8Start = src.fImage;
1010 auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
1011 auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
1012 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1013 auto tmpStart = &tmp[y];
1014 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1015 }
1016 } break;
1017 case SkMask::kARGB32_Format: {
1018 const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
1019 auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
1020 auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
1021 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1022 auto tmpStart = &tmp[y];
1023 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1024 }
1025 } break;
1026 case SkMask::kLCD16_Format: {
1027 const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
1028 auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
1029 auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
1030 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1031 auto tmpStart = &tmp[y];
1032 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1033 }
1034 } break;
1035 default:
1036 SK_ABORT("Unhandled format.");
1037 }
1038
1039 // Blur vertically (scan in memory order because of the transposition),
1040 // and transpose back to the original orientation.
1041 const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
1042 for (int y = 0; y < tmpH; y++) {
1043 auto tmpStart = &tmp[y * tmpW];
1044 auto dstStart = &dst->fImage[y];
1045
1046 scanH.blur(tmpStart, tmpStart + tmpW,
1047 dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
1048 }
1049
1050 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
1051}
1052