1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_TARGET_OPT_SSE2 |
9 | |
10 | #include "./blgradient_p.h" |
11 | #include "./blmath_p.h" |
12 | #include "./blsimd_p.h" |
13 | #include "./blsupport_p.h" |
14 | |
15 | // ============================================================================ |
16 | // [BLGradientOps - Interpolate32@SSE2] |
17 | // ============================================================================ |
18 | |
// Fills the pixel table `dPtr[0..dSize)` with 32-bit colors interpolated
// between the gradient stops `sPtr[0..sSize)`. Every color is multiplied by
// its own alpha before it is stored (see the `cA` swizzle + `vdiv255u16()`).
//
//   dPtr  - destination table, must not be null (asserted).
//   dSize - number of pixels in `dPtr`, must be non-zero (asserted).
//   sPtr  - gradient stops, must not be null (asserted). Only `offset` and
//           `rgba` of each stop are read; `rgba` is loaded as 64 bits.
//           NOTE(review): the code assumes offsets are ascending and within
//           [0, 1] - nothing here checks that, verify against the caller.
//   sSize - number of stops, must be non-zero (asserted).
//
// Fixed-point formats used below:
//   - Stop positions (`p0`, `p1`) are pixel indexes scaled by 256, so
//     `p >> 8` is the pixel index.
//   - Channel accumulators in the span loops are scaled by `1 << 23`; `half`
//     (`1 << 22`) is added once so that `>> 23` rounds to nearest.
//
// NOTE(review): the `v...()` helpers and `BL_SIMD_LOOP_32x4_*` macros are
// defined outside this file (presumably blsimd_p.h); remarks about them below
// are inferred from their names and usage - confirm against their definitions.
19 | void BL_CDECL blGradientInterpolate32_SSE2(uint32_t* dPtr, uint32_t dSize, const BLGradientStop* sPtr, size_t sSize) noexcept { |
20 | using namespace SIMD; |
21 | |
22 | BL_ASSERT(dPtr != nullptr); |
23 | BL_ASSERT(dSize > 0); |
24 | |
25 | BL_ASSERT(sPtr != nullptr); |
26 | BL_ASSERT(sSize > 0); |
27 | |
// Both `dSpanPtr` and `i` are reassigned at the top of every loop iteration
// before they are used, these are just initial values.
28 | uint32_t* dSpanPtr = dPtr; |
29 | uint32_t i = dSize; |
30 | |
// `c0` carries the color of the previous stop (initially the first stop),
// `c1` receives the color of the stop that is currently processed.
31 | I128 c0 = vloadi128_64(&sPtr[0].rgba); |
32 | I128 c1; |
33 | |
// `half` is the rounding term of the `1 << 23` scaled accumulators.
// `argb64_a255` sets the low byte of the 4th 16-bit lane of each 64-bit pixel
// to 0xFF; OR-ing it in before the multiplication by alpha makes that lane
// evaluate to `255 * a / 255`, i.e. alpha itself is kept.
34 | I128 half = vseti128i32(1 << (23 - 1)); |
35 | I128 argb64_a255 = vseti128u64(0x00FF000000000000u); |
36 | |
// Position of the previous (`p0`) and of the current (`p1`) stop, scaled by 256.
37 | uint32_t p0 = 0; |
38 | uint32_t p1; |
39 | |
// Start at the second stop if the first one lies exactly at 0.0 and is not
// the only stop - its color is already in `c0`. `dSize` is decremented here
// and holds the index of the last pixel from now on; `fWidth` is that index
// scaled by 256.
40 | size_t sIndex = size_t(sPtr[0].offset == 0.0 && sSize > 1); |
41 | double fWidth = double(int32_t(--dSize) << 8); |
42 | |
// One iteration per stop: emits the span between the previous and this stop.
43 | do { |
44 | c1 = vloadi128_64(&sPtr[sIndex].rgba); |
45 | p1 = uint32_t(blRoundToInt(sPtr[sIndex].offset * fWidth)); |
46 | |
// The span starts at the pixel of the previous stop; `i` is the distance in
// whole pixels to the pixel of the current stop.
47 | dSpanPtr = dPtr + (p0 >> 8); |
48 | i = ((p1 >> 8) - (p0 >> 8)); |
49 | p0 = p1; |
50 | |
// Short span (both stops in the same pixel or in adjacent pixels): nothing
// to interpolate, both colors are converted together in a single register
// (`c0` in the low and `c1` in the high 64 bits).
51 | if (i <= 1) { |
52 | I128 cPix = vunpackli64(c0, c1); |
53 | c0 = c1; |
54 | cPix = vsrli16<8>(cPix); |
55 | |
// Multiply by alpha and pack; the first store writes the previous stop's pixel.
56 | I128 cA = vswizi16<3, 3, 3, 3>(cPix); |
57 | cPix = vor(cPix, argb64_a255); |
58 | cPix = vdiv255u16(vmuli16(cPix, cA)); |
59 | cPix = vpacki16u8(cPix); |
60 | vstorei32(dSpanPtr, cPix); |
61 | dSpanPtr++; |
62 | |
// Same pixel as the previous stop: go to the next stop (`continue` jumps to
// the loop condition). The next iteration recomputes `dSpanPtr` from `p0`,
// so the pixel just written gets overwritten by the later stop.
63 | if (i == 0) |
64 | continue; |
65 | |
// Adjacent pixel: store the second 32-bit lane, which holds the current
// stop's color (it came from the high 64 bits of the unpacked pair).
66 | cPix = vswizi32<1, 1, 1, 1>(cPix); |
67 | vstorei32(dSpanPtr, cPix); |
68 | dSpanPtr++; |
69 | } |
70 | else { |
// NOTE(review): presumably declares the state shared by the MINI/MAIN loop
// macros below - confirm against the macro definition.
71 | BL_SIMD_LOOP_32x4_INIT() |
72 | |
// Per-pixel increment of each channel, scaled by `1 << 23`.
73 | I128 cD; |
74 | |
75 | // Scale `cD` by taking advantage of SSE2-FP division. |
76 | { |
// scale = 2^23 / i, computed in double precision.
77 | D128 scale = vdivsd(vcvtd64d128(1 << 23), vcvti32d128(int(i))); |
78 | |
// Duplicating every byte and shifting each 32-bit lane right by 24 leaves
// the high byte of every 16-bit channel in its own 32-bit lane.
79 | c0 = vunpackli8(c0, c0); |
80 | cD = vunpackli8(c1, c1); |
81 | |
// cD = c1 - c0 per channel (signed); c0 becomes the scaled start value.
82 | c0 = vsrli32<24>(c0); |
83 | cD = vsrli32<24>(cD); |
84 | cD = vsubi32(cD, c0); |
85 | c0 = vslli32<23>(c0); |
86 | |
// Multiply the four channel differences by `scale`, two lanes at a time,
// and convert the products back to 32-bit integers.
// NOTE(review): `vcvttd128i128()` presumably truncates - confirm.
87 | D128 lo = vcvti128d128(cD); |
88 | cD = vswapi64(cD); |
89 | scale = vdupld64(scale); |
90 | |
91 | D128 hi = vcvti128d128(cD); |
92 | lo = vmulpd(lo, scale); |
93 | hi = vmulpd(hi, scale); |
94 | |
95 | cD = vcvttd128i128(lo); |
96 | cD = vunpackli64(cD, vcvttd128i128(hi)); |
97 | } |
98 | |
// Add the rounding term once; the span includes both end pixels, which is
// why `i + 1` pixels are emitted.
99 | c0 = vaddi32(c0, half); |
100 | i++; |
101 | |
// One pixel per iteration.
// NOTE(review): presumably runs until `dSpanPtr` is 16-byte aligned and also
// handles the pixels left after the main loop - confirm against the macro.
102 | BL_SIMD_LOOP_32x4_MINI_BEGIN(Loop, dSpanPtr, i) |
103 | I128 cPix, cA; |
104 | |
// Take the integer part of the accumulator, then advance it by one pixel.
105 | cPix = vsrli32<23>(c0); |
106 | c0 = vaddi32(c0, cD); |
107 | |
108 | cPix = vpacki32i16(cPix, cPix); |
109 | cA = vswizi16<3, 3, 3, 3>(cPix); |
110 | cPix = vor(cPix, argb64_a255); |
111 | cPix = vdiv255u16(vmuli16(cPix, cA)); |
112 | cPix = vpacki16u8(cPix); |
113 | vstorei32(dSpanPtr, cPix); |
114 | |
115 | dSpanPtr++; |
116 | BL_SIMD_LOOP_32x4_MINI_END(Loop) |
117 | |
// Four pixels per iteration, written by a single 128-bit store.
118 | BL_SIMD_LOOP_32x4_MAIN_BEGIN(Loop) |
119 | I128 cPix0, cA0; |
120 | I128 cPix1, cA1; |
121 | |
// Pixels 0 and 1 are packed into `cPix0`.
122 | cPix0 = vsrli32<23>(c0); |
123 | c0 = vaddi32(c0, cD); |
124 | |
125 | cPix1 = vsrli32<23>(c0); |
126 | c0 = vaddi32(c0, cD); |
127 | cPix0 = vpacki32i16(cPix0, cPix1); |
128 | |
// Pixels 2 and 3 are packed into `cPix1`; `cA0` is only a temporary for
// pixel 3 here and is overwritten by the real alpha right below.
129 | cPix1 = vsrli32<23>(c0); |
130 | c0 = vaddi32(c0, cD); |
131 | |
132 | cA0 = vsrli32<23>(c0); |
133 | c0 = vaddi32(c0, cD); |
134 | cPix1 = vpacki32i16(cPix1, cA0); |
135 | |
// Multiply all four pixels by their alpha.
136 | cA0 = vswizi16<3, 3, 3, 3>(cPix0); |
137 | cA1 = vswizi16<3, 3, 3, 3>(cPix1); |
138 | |
139 | cPix0 = vor(cPix0, argb64_a255); |
140 | cPix1 = vor(cPix1, argb64_a255); |
141 | |
142 | cPix0 = vdiv255u16(vmuli16(cPix0, cA0)); |
143 | cPix1 = vdiv255u16(vmuli16(cPix1, cA1)); |
144 | |
// NOTE(review): `vstorei128a()` presumably requires `dSpanPtr` to be 16-byte
// aligned at this point - confirm that the loop macros guarantee it.
145 | cPix0 = vpacki16u8(cPix0, cPix1); |
146 | vstorei128a(dSpanPtr, cPix0); |
147 | |
148 | dSpanPtr += 4; |
149 | BL_SIMD_LOOP_32x4_MAIN_END(Loop) |
150 | |
// The current stop becomes the start of the next span (this also discards
// the scaled accumulator that `c0` held during the loops above).
151 | c0 = c1; |
152 | } |
153 | } while (++sIndex < sSize); |
154 | |
155 | // The last stop doesn't have to end at 1.0. If it doesn't, the remaining |
156 | // space is filled by the last color stop (premultiplied). |
157 | { |
158 | I128 cA; |
// Number of pixels between the current write position and the end of the
// table (`dSize` holds the last index here, hence the `+ 1`).
159 | i = uint32_t((size_t)((dPtr + dSize + 1) - dSpanPtr)); |
160 | |
// `c0` holds the last stop's color in its low 64 bits; the first stop's color
// is loaded into the high 64 bits so both are converted by the same sequence.
161 | c0 = vloadi128_h64(c0, &sPtr[0].rgba); |
162 | c0 = vsrli16<8>(c0); |
163 | |
164 | cA = vswizi16<3, 3, 3, 3>(c0); |
165 | c0 = vor(c0, argb64_a255); |
166 | c0 = vdiv255u16(vmuli16(c0, cA)); |
167 | c0 = vpacki16u8(c0); |
// Keep a copy for the first-pixel fix-up at the end of the function.
168 | c1 = c0; |
169 | } |
170 | |
// Fill the tail, one pixel per iteration, with the last stop's color.
171 | if (i != 0) { |
172 | do { |
173 | vstorei32(dSpanPtr, c0); |
174 | dSpanPtr++; |
175 | } while (--i); |
176 | } |
177 | |
178 | // The first pixel has to be always set to the first stop's color. The main |
179 | // loop always honors the last color value of the stop colliding with the |
180 | // previous offset index - for example if multiple stops have the same offset |
181 | // [0.0] the first pixel will be the last stop's color. This is easier to fix |
182 | // here as we don't need extra conditions in the main loop. |
// The second 32-bit lane of `c1` is the converted first stop's color.
183 | vstorei32(dPtr, vswizi32<1, 1, 1, 1>(c1)); |
184 | } |
185 | |
186 | #endif |
187 | |