1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_TARGET_OPT_AVX2 |
9 | |
10 | #include "./blgradient_p.h" |
11 | #include "./blmath_p.h" |
12 | #include "./blsimd_p.h" |
13 | #include "./blsupport_p.h" |
14 | |
15 | // ============================================================================ |
16 | // [BLGradientOps - InterpolateLut32 [AVX2]] |
17 | // ============================================================================ |
18 | |
19 | void BL_CDECL blGradientInterpolate32_AVX2(uint32_t* dPtr, uint32_t dWidth, const BLGradientStop* sPtr, size_t sSize) noexcept { |
20 | using namespace SIMD; |
21 | |
22 | BL_ASSERT(dPtr != nullptr); |
23 | BL_ASSERT(dWidth > 0); |
24 | |
25 | BL_ASSERT(sPtr != nullptr); |
26 | BL_ASSERT(sSize > 0); |
27 | |
28 | uint32_t* dSpanPtr = dPtr; |
29 | uint32_t i = dWidth; |
30 | |
31 | I128 c0 = vloadi128_64(&sPtr[0].rgba); |
32 | I128 c1; |
33 | |
34 | I128 half = vseti128i32(1 << (23 - 1)); |
35 | I256 argb64_a255 = vseti256u64(0x00FF000000000000u); |
36 | |
37 | uint32_t u0 = 0; |
38 | uint32_t u1; |
39 | |
40 | size_t sIndex = size_t(sPtr[0].offset == 0.0 && sSize > 1); |
41 | double fWidth = double(int32_t(--dWidth) << 8); |
42 | |
43 | do { |
44 | c1 = vloadi128_64(&sPtr[sIndex].rgba); |
45 | u1 = uint32_t(blRoundToInt(sPtr[sIndex].offset * fWidth)); |
46 | |
47 | dSpanPtr = dPtr + (u0 >> 8); |
48 | i = ((u1 >> 8) - (u0 >> 8)); |
49 | u0 = u1; |
50 | |
51 | if (i <= 1) { |
52 | I128 cPix = vunpackli64(c0, c1); |
53 | c0 = c1; |
54 | cPix = vsrli16<8>(cPix); |
55 | |
56 | I128 cA = vswizi16<3, 3, 3, 3>(cPix); |
57 | cPix = vor(cPix, vcast<I128>(argb64_a255)); |
58 | cPix = vdiv255u16(vmuli16(cPix, cA)); |
59 | cPix = vpacki16u8(cPix); |
60 | vstorei32(dSpanPtr, cPix); |
61 | dSpanPtr++; |
62 | |
63 | if (i == 0) |
64 | continue; |
65 | |
66 | cPix = vswizi32<1, 1, 1, 1>(cPix); |
67 | vstorei32(dSpanPtr, cPix); |
68 | dSpanPtr++; |
69 | } |
70 | else { |
71 | I256 dx; |
72 | |
73 | // Scale `dx` by taking advantage of DP-FP division. |
74 | { |
75 | I128 cx; |
76 | D128 scale = vdivsd(vcvtd64d128(1 << 23), vcvti32d128(int(i))); |
77 | |
78 | c0 = vunpackli8(c0, c0); |
79 | cx = vunpackli8(c1, c1); |
80 | |
81 | c0 = vsrli32<24>(c0); |
82 | cx = vsrli32<24>(cx); |
83 | cx = vsubi32(cx, c0); |
84 | c0 = vslli32<23>(c0); |
85 | |
86 | dx = vdupli128(vcvttd256i128(vmulpd(vcvti128d256(cx), vsplatd64d256(scale)))); |
87 | } |
88 | |
89 | c0 = vaddi32(c0, half); |
90 | uint32_t n = i + 1; |
91 | |
92 | if (n >= 8) { |
93 | I256 cx = vaddi32(vdupli128(c0), vpermi128<0, -1>(vcast<I256>(vslli32<2>(dx)))); |
94 | I256 dx5 = vaddi32(vslli32<2>(dx), dx); |
95 | |
96 | while (n >= 16) { |
97 | I256 p40 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
98 | I256 p51 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
99 | I256 p5410 = vpacki32i16(p40, p51); |
100 | |
101 | I256 p62 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
102 | I256 p73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5); |
103 | I256 p7632 = vpacki32i16(p62, p73); |
104 | |
105 | I256 q40 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
106 | I256 q51 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
107 | I256 q5410 = vpacki32i16(q40, q51); |
108 | |
109 | I256 q62 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
110 | I256 q73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5); |
111 | I256 q7632 = vpacki32i16(q62, q73); |
112 | |
113 | p5410 = vmulu16(vor(p5410, argb64_a255), vswizi16<3, 3, 3, 3>(p5410)); |
114 | p7632 = vmulu16(vor(p7632, argb64_a255), vswizi16<3, 3, 3, 3>(p7632)); |
115 | q5410 = vmulu16(vor(q5410, argb64_a255), vswizi16<3, 3, 3, 3>(q5410)); |
116 | q7632 = vmulu16(vor(q7632, argb64_a255), vswizi16<3, 3, 3, 3>(q7632)); |
117 | |
118 | p5410 = vdiv255u16(p5410); |
119 | p7632 = vdiv255u16(p7632); |
120 | q5410 = vdiv255u16(q5410); |
121 | q7632 = vdiv255u16(q7632); |
122 | |
123 | I256 pp = vpacki16u8(p5410, p7632); |
124 | I256 qp = vpacki16u8(q5410, q7632); |
125 | |
126 | vstorei256u(dSpanPtr + 0, pp); |
127 | vstorei256u(dSpanPtr + 8, qp); |
128 | |
129 | n -= 16; |
130 | dSpanPtr += 16; |
131 | } |
132 | |
133 | while (n >= 8) { |
134 | I256 p40 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
135 | I256 p51 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
136 | I256 p5410 = vpacki32i16(p40, p51); |
137 | |
138 | I256 p62 = vsrli32<23>(cx); cx = vaddi32(cx, dx); |
139 | I256 p73 = vsrli32<23>(cx); cx = vaddi32(cx, dx5); |
140 | I256 p7632 = vpacki32i16(p62, p73); |
141 | |
142 | p5410 = vmulu16(vor(p5410, argb64_a255), vswizi16<3, 3, 3, 3>(p5410)); |
143 | p7632 = vmulu16(vor(p7632, argb64_a255), vswizi16<3, 3, 3, 3>(p7632)); |
144 | |
145 | p5410 = vdiv255u16(p5410); |
146 | p7632 = vdiv255u16(p7632); |
147 | |
148 | I256 pp = vpacki16u8(p5410, p7632); |
149 | vstorei256u(dSpanPtr, pp); |
150 | |
151 | n -= 8; |
152 | dSpanPtr += 8; |
153 | } |
154 | |
155 | c0 = vcast<I128>(cx); |
156 | } |
157 | |
158 | while (n >= 2) { |
159 | I128 p0 = vsrli32<23>(c0); c0 = vaddi32(c0, vcast<I128>(dx)); |
160 | I128 p1 = vsrli32<23>(c0); c0 = vaddi32(c0, vcast<I128>(dx)); |
161 | |
162 | p0 = vpacki32i16(p0, p1); |
163 | p0 = vdiv255u16(vmuli16(vor(p0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(p0))); |
164 | |
165 | p0 = vpacki16u8(p0); |
166 | vstorei64(dSpanPtr, p0); |
167 | |
168 | n -= 2; |
169 | dSpanPtr += 2; |
170 | } |
171 | |
172 | if (n) { |
173 | I128 p0 = vsrli32<23>(c0); |
174 | c0 = vaddi32(c0, vcast<I128>(dx)); |
175 | |
176 | p0 = vpacki32i16(p0, p0); |
177 | p0 = vdiv255u16(vmuli16(vor(p0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(p0))); |
178 | |
179 | p0 = vpacki16u8(p0); |
180 | vstorei32(dSpanPtr, p0); |
181 | |
182 | dSpanPtr++; |
183 | } |
184 | |
185 | c0 = c1; |
186 | } |
187 | } while (++sIndex < sSize); |
188 | |
189 | // The last stop doesn't have to end at 1.0, in such case the remaining space |
190 | // is filled by the last color stop (premultiplied). |
191 | { |
192 | i = uint32_t((size_t)((dPtr + dWidth + 1) - dSpanPtr)); |
193 | |
194 | c0 = vloadi128_h64(c0, &sPtr[0].rgba); |
195 | c0 = vsrli16<8>(c0); |
196 | |
197 | c0 = vdiv255u16(vmuli16(vor(c0, vcast<I128>(argb64_a255)), vswizi16<3, 3, 3, 3>(c0))); |
198 | c0 = vpacki16u8(c0); |
199 | c1 = c0; |
200 | } |
201 | |
202 | if (i != 0) { |
203 | do { |
204 | vstorei32(dSpanPtr, c0); |
205 | dSpanPtr++; |
206 | } while (--i); |
207 | } |
208 | |
209 | // The first pixel has to be always set to the first stop's color. The main |
210 | // loop always honors the last color value of the stop colliding with the |
211 | // previous offset index - for example if multiple stops have the same offset |
212 | // [0.0] the first pixel will be the last stop's color. This is easier to fix |
213 | // here as we don't need extra conditions in the main loop. |
214 | vstorei32(dPtr, vswizi32<1, 1, 1, 1>(c1)); |
215 | } |
216 | |
217 | #endif |
218 | |