// [Blend2D]
// 2D Vector Graphics Powered by a JIT Compiler.
//
// [License]
// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_TARGET_OPT_SSE2
9
10#include "./blgradient_p.h"
11#include "./blmath_p.h"
12#include "./blsimd_p.h"
13#include "./blsupport_p.h"
14
// ============================================================================
// [BLGradientOps - Interpolate32@SSE2]
// ============================================================================
18
19void BL_CDECL blGradientInterpolate32_SSE2(uint32_t* dPtr, uint32_t dSize, const BLGradientStop* sPtr, size_t sSize) noexcept {
20 using namespace SIMD;
21
22 BL_ASSERT(dPtr != nullptr);
23 BL_ASSERT(dSize > 0);
24
25 BL_ASSERT(sPtr != nullptr);
26 BL_ASSERT(sSize > 0);
27
28 uint32_t* dSpanPtr = dPtr;
29 uint32_t i = dSize;
30
31 I128 c0 = vloadi128_64(&sPtr[0].rgba);
32 I128 c1;
33
34 I128 half = vseti128i32(1 << (23 - 1));
35 I128 argb64_a255 = vseti128u64(0x00FF000000000000u);
36
37 uint32_t p0 = 0;
38 uint32_t p1;
39
40 size_t sIndex = size_t(sPtr[0].offset == 0.0 && sSize > 1);
41 double fWidth = double(int32_t(--dSize) << 8);
42
43 do {
44 c1 = vloadi128_64(&sPtr[sIndex].rgba);
45 p1 = uint32_t(blRoundToInt(sPtr[sIndex].offset * fWidth));
46
47 dSpanPtr = dPtr + (p0 >> 8);
48 i = ((p1 >> 8) - (p0 >> 8));
49 p0 = p1;
50
51 if (i <= 1) {
52 I128 cPix = vunpackli64(c0, c1);
53 c0 = c1;
54 cPix = vsrli16<8>(cPix);
55
56 I128 cA = vswizi16<3, 3, 3, 3>(cPix);
57 cPix = vor(cPix, argb64_a255);
58 cPix = vdiv255u16(vmuli16(cPix, cA));
59 cPix = vpacki16u8(cPix);
60 vstorei32(dSpanPtr, cPix);
61 dSpanPtr++;
62
63 if (i == 0)
64 continue;
65
66 cPix = vswizi32<1, 1, 1, 1>(cPix);
67 vstorei32(dSpanPtr, cPix);
68 dSpanPtr++;
69 }
70 else {
71 BL_SIMD_LOOP_32x4_INIT()
72
73 I128 cD;
74
75 // Scale `cD` by taking advantage of SSE2-FP division.
76 {
77 D128 scale = vdivsd(vcvtd64d128(1 << 23), vcvti32d128(int(i)));
78
79 c0 = vunpackli8(c0, c0);
80 cD = vunpackli8(c1, c1);
81
82 c0 = vsrli32<24>(c0);
83 cD = vsrli32<24>(cD);
84 cD = vsubi32(cD, c0);
85 c0 = vslli32<23>(c0);
86
87 D128 lo = vcvti128d128(cD);
88 cD = vswapi64(cD);
89 scale = vdupld64(scale);
90
91 D128 hi = vcvti128d128(cD);
92 lo = vmulpd(lo, scale);
93 hi = vmulpd(hi, scale);
94
95 cD = vcvttd128i128(lo);
96 cD = vunpackli64(cD, vcvttd128i128(hi));
97 }
98
99 c0 = vaddi32(c0, half);
100 i++;
101
102 BL_SIMD_LOOP_32x4_MINI_BEGIN(Loop, dSpanPtr, i)
103 I128 cPix, cA;
104
105 cPix = vsrli32<23>(c0);
106 c0 = vaddi32(c0, cD);
107
108 cPix = vpacki32i16(cPix, cPix);
109 cA = vswizi16<3, 3, 3, 3>(cPix);
110 cPix = vor(cPix, argb64_a255);
111 cPix = vdiv255u16(vmuli16(cPix, cA));
112 cPix = vpacki16u8(cPix);
113 vstorei32(dSpanPtr, cPix);
114
115 dSpanPtr++;
116 BL_SIMD_LOOP_32x4_MINI_END(Loop)
117
118 BL_SIMD_LOOP_32x4_MAIN_BEGIN(Loop)
119 I128 cPix0, cA0;
120 I128 cPix1, cA1;
121
122 cPix0 = vsrli32<23>(c0);
123 c0 = vaddi32(c0, cD);
124
125 cPix1 = vsrli32<23>(c0);
126 c0 = vaddi32(c0, cD);
127 cPix0 = vpacki32i16(cPix0, cPix1);
128
129 cPix1 = vsrli32<23>(c0);
130 c0 = vaddi32(c0, cD);
131
132 cA0 = vsrli32<23>(c0);
133 c0 = vaddi32(c0, cD);
134 cPix1 = vpacki32i16(cPix1, cA0);
135
136 cA0 = vswizi16<3, 3, 3, 3>(cPix0);
137 cA1 = vswizi16<3, 3, 3, 3>(cPix1);
138
139 cPix0 = vor(cPix0, argb64_a255);
140 cPix1 = vor(cPix1, argb64_a255);
141
142 cPix0 = vdiv255u16(vmuli16(cPix0, cA0));
143 cPix1 = vdiv255u16(vmuli16(cPix1, cA1));
144
145 cPix0 = vpacki16u8(cPix0, cPix1);
146 vstorei128a(dSpanPtr, cPix0);
147
148 dSpanPtr += 4;
149 BL_SIMD_LOOP_32x4_MAIN_END(Loop)
150
151 c0 = c1;
152 }
153 } while (++sIndex < sSize);
154
155 // The last stop doesn't have to end at 1.0, in such case the remaining space
156 // is filled by the last color stop (premultiplied).
157 {
158 I128 cA;
159 i = uint32_t((size_t)((dPtr + dSize + 1) - dSpanPtr));
160
161 c0 = vloadi128_h64(c0, &sPtr[0].rgba);
162 c0 = vsrli16<8>(c0);
163
164 cA = vswizi16<3, 3, 3, 3>(c0);
165 c0 = vor(c0, argb64_a255);
166 c0 = vdiv255u16(vmuli16(c0, cA));
167 c0 = vpacki16u8(c0);
168 c1 = c0;
169 }
170
171 if (i != 0) {
172 do {
173 vstorei32(dSpanPtr, c0);
174 dSpanPtr++;
175 } while (--i);
176 }
177
178 // The first pixel has to be always set to the first stop's color. The main
179 // loop always honors the last color value of the stop colliding with the
180 // previous offset index - for example if multiple stops have the same offset
181 // [0.0] the first pixel will be the last stop's color. This is easier to fix
182 // here as we don't need extra conditions in the main loop.
183 vstorei32(dPtr, vswizi32<1, 1, 1, 1>(c1));
184}
185
186#endif
187