1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_TARGET_OPT_AVX2
9
10#include "./blgradient_p.h"
11#include "./blmath_p.h"
12#include "./blsimd_p.h"
13#include "./blsupport_p.h"
14
15// ============================================================================
16// [BLGradientOps - InterpolateLut32 [AVX2]]
17// ============================================================================
18
19void BL_CDECL blGradientInterpolate32_AVX2(uint32_t* dPtr, uint32_t dWidth, const BLGradientStop* sPtr, size_t sSize) noexcept {
20 using namespace SIMD;
21
22 BL_ASSERT(dPtr != nullptr);
23 BL_ASSERT(dWidth > 0);
24
25 BL_ASSERT(sPtr != nullptr);
26 BL_ASSERT(sSize > 0);
27
28 uint32_t* dSpanPtr = dPtr;
29 uint32_t i = dWidth;
30
31 I128 c0 = vloadi128_64(&sPtr[0].rgba);
32 I128 c1;
33
34 I128 half = vseti128i32(1 << (23 - 1));
35 I256 argb64_a255 = vseti256u64(0x00FF000000000000u);
36
37 uint32_t u0 = 0;
38 uint32_t u1;
39
40 size_t sIndex = size_t(sPtr[0].offset == 0.0 && sSize > 1);
41 double fWidth = double(int32_t(--dWidth) << 8);
42
43 do {
44 c1 = vloadi128_64(&sPtr[sIndex].rgba);
45 u1 = uint32_t(blRoundToInt(sPtr[sIndex].offset * fWidth));
46
47 dSpanPtr = dPtr + (u0 >> 8);
48 i = ((u1 >> 8) - (u0 >> 8));
49 u0 = u1;
50
51 if (i <= 1) {
52 I128 cPix = vunpackli64(c0, c1);
53 c0 = c1;
54 cPix = vsrli16<8>(cPix);
55
56 I128 cA = vswizi16<3, 3, 3, 3>(cPix);
57 cPix = vor(cPix, vcast<I128>(argb64_a255));
58 cPix = vdiv255u16(vmuli16(cPix, cA));
59 cPix = vpacki16u8(cPix);
60 vstorei32(dSpanPtr, cPix);
61 dSpanPtr++;
62
63 if (i == 0)
64 continue;
65
66 cPix = vswizi32<1, 1, 1, 1>(cPix);
67 vstorei32(dSpanPtr, cPix);
68 dSpanPtr++;
69 }
70 else {
71 I256 dx;
72
73 // Scale `dx` by taking advantage of DP-FP division.
74 {
75 I128 cx;
76 D128 scale = vdivsd(vcvtd64d128(1 << 23), vcvti32d128(int(i)));
77
78 c0 = vunpackli8(c0, c0);
79 cx = vunpackli8(c1, c1);
80
81 c0 = vsrli32<24>(c0);
82 cx = vsrli32<24>(cx);
83 cx = vsubi32(cx, c0);
84 c0 = vslli32<23>(c0);
85
86 dx = vdupli128(vcvttd256i128(vmulpd(vcvti128d256(cx), vsplatd64d256(scale))));
87 }
88
89 c0 = vaddi32(c0, half);
90 uint32_t n = i + 1;
91
92 if (n >= 8) {
93 I256 cx = vaddi32(vdupli128(c0), vpermi128<0, -1>(vcast<I256>(vslli32<2>(dx))));
94 I256 dx5 = vaddi32(vslli32<2>(dx), dx);
95
96 while (n >= 16) {
97 I256 p40 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
98 I256 p51 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
99 I256 p5410 = vpacki32i16(p40, p51);
100
101 I256 p62 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
102 I256 p73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5);
103 I256 p7632 = vpacki32i16(p62, p73);
104
105 I256 q40 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
106 I256 q51 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
107 I256 q5410 = vpacki32i16(q40, q51);
108
109 I256 q62 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
110 I256 q73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5);
111 I256 q7632 = vpacki32i16(q62, q73);
112
113 p5410 = vmulu16(vor(p5410, argb64_a255), vswizi16<3, 3, 3, 3>(p5410));
114 p7632 = vmulu16(vor(p7632, argb64_a255), vswizi16<3, 3, 3, 3>(p7632));
115 q5410 = vmulu16(vor(q5410, argb64_a255), vswizi16<3, 3, 3, 3>(q5410));
116 q7632 = vmulu16(vor(q7632, argb64_a255), vswizi16<3, 3, 3, 3>(q7632));
117
118 p5410 = vdiv255u16(p5410);
119 p7632 = vdiv255u16(p7632);
120 q5410 = vdiv255u16(q5410);
121 q7632 = vdiv255u16(q7632);
122
123 I256 pp = vpacki16u8(p5410, p7632);
124 I256 qp = vpacki16u8(q5410, q7632);
125
126 vstorei256u(dSpanPtr + 0, pp);
127 vstorei256u(dSpanPtr + 8, qp);
128
129 n -= 16;
130 dSpanPtr += 16;
131 }
132
133 while (n >= 8) {
134 I256 p40 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
135 I256 p51 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
136 I256 p5410 = vpacki32i16(p40, p51);
137
138 I256 p62 = vsrli32<23>(cx); cx = vaddi32(cx, dx);
139 I256 p73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5);
140 I256 p7632 = vpacki32i16(p62, p73);
141
142 p5410 = vmulu16(vor(p5410, argb64_a255), vswizi16<3, 3, 3, 3>(p5410));
143 p7632 = vmulu16(vor(p7632, argb64_a255), vswizi16<3, 3, 3, 3>(p7632));
144
145 p5410 = vdiv255u16(p5410);
146 p7632 = vdiv255u16(p7632);
147
148 I256 pp = vpacki16u8(p5410, p7632);
149 vstorei256u(dSpanPtr, pp);
150
151 n -= 8;
152 dSpanPtr += 8;
153 }
154
155 c0 = vcast<I128>(cx);
156 }
157
158 while (n >= 2) {
159 I128 p0 = vsrli32<23>(c0); c0 = vaddi32(c0, vcast<I128>(dx));
160 I128 p1 = vsrli32<23>(c0); c0 = vaddi32(c0, vcast<I128>(dx));
161
162 p0 = vpacki32i16(p0, p1);
163 p0 = vdiv255u16(vmuli16(vor(p0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(p0)));
164
165 p0 = vpacki16u8(p0);
166 vstorei64(dSpanPtr, p0);
167
168 n -= 2;
169 dSpanPtr += 2;
170 }
171
172 if (n) {
173 I128 p0 = vsrli32<23>(c0);
174 c0 = vaddi32(c0, vcast<I128>(dx));
175
176 p0 = vpacki32i16(p0, p0);
177 p0 = vdiv255u16(vmuli16(vor(p0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(p0)));
178
179 p0 = vpacki16u8(p0);
180 vstorei32(dSpanPtr, p0);
181
182 dSpanPtr++;
183 }
184
185 c0 = c1;
186 }
187 } while (++sIndex < sSize);
188
189 // The last stop doesn't have to end at 1.0, in such case the remaining space
190 // is filled by the last color stop (premultiplied).
191 {
192 i = uint32_t((size_t)((dPtr + dWidth + 1) - dSpanPtr));
193
194 c0 = vloadi128_h64(c0, &sPtr[0].rgba);
195 c0 = vsrli16<8>(c0);
196
197 c0 = vdiv255u16(vmuli16(vor(c0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(c0)));
198 c0 = vpacki16u8(c0);
199 c1 = c0;
200 }
201
202 if (i != 0) {
203 do {
204 vstorei32(dSpanPtr, c0);
205 dSpanPtr++;
206 } while (--i);
207 }
208
209 // The first pixel has to be always set to the first stop's color. The main
210 // loop always honors the last color value of the stop colliding with the
211 // previous offset index - for example if multiple stops have the same offset
212 // [0.0] the first pixel will be the last stop's color. This is easier to fix
213 // here as we don't need extra conditions in the main loop.
214 vstorei32(dPtr, vswizi32<1, 1, 1, 1>(c1));
215}
216
217#endif
218