1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_TARGET_OPT_SSE2
9
10#include "./blgeometry.h"
11#include "./blmatrix.h"
12#include "./blruntime_p.h"
13#include "./blsimd_p.h"
14#include "./blsupport_p.h"
15
16// ============================================================================
17// [BLMatrix2D - MapPointDArray [SSE2]]
18// ============================================================================
19
20static BLResult BL_CDECL blMatrix2DMapPointDArrayIdentity_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept {
21 using namespace SIMD;
22
23 BL_UNUSED(self);
24 if (dst == src)
25 return BL_SUCCESS;
26
27 size_t i = size;
28 if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) {
29 while (i >= 4) {
30 D128 s0 = vloadd128a(src + 0);
31 D128 s1 = vloadd128a(src + 1);
32 D128 s2 = vloadd128a(src + 2);
33 D128 s3 = vloadd128a(src + 3);
34
35 vstored128a(dst + 0, s0);
36 vstored128a(dst + 1, s1);
37 vstored128a(dst + 2, s2);
38 vstored128a(dst + 3, s3);
39
40 i -= 4;
41 dst += 4;
42 src += 4;
43 }
44
45 while (i) {
46 vstored128a(dst, vloadd128a(src));
47 i--;
48 dst++;
49 src++;
50 }
51 }
52 else {
53 while (i) {
54 D128 sx = vloadd128_64(&src->x);
55 D128 sy = vloadd128_64(&src->y);
56
57 vstored64(&dst->x, sx);
58 vstored64(&dst->y, sy);
59
60 i--;
61 dst++;
62 src++;
63 }
64 }
65
66 return BL_SUCCESS;
67}
68
69static BLResult BL_CDECL blMatrix2DMapPointDArrayTranslate_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept {
70 using namespace SIMD;
71
72 size_t i = size;
73 D128 m20_m21 = vloadd128u(&self->m20);
74
75 if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) {
76 while (i >= 4) {
77 D128 s0 = vloadd128a(src + 0);
78 D128 s1 = vloadd128a(src + 1);
79 D128 s2 = vloadd128a(src + 2);
80 D128 s3 = vloadd128a(src + 3);
81
82 vstored128a(dst + 0, vaddpd(s0, m20_m21));
83 vstored128a(dst + 1, vaddpd(s1, m20_m21));
84 vstored128a(dst + 2, vaddpd(s2, m20_m21));
85 vstored128a(dst + 3, vaddpd(s3, m20_m21));
86
87 i -= 4;
88 dst += 4;
89 src += 4;
90 }
91
92 while (i) {
93 D128 s0 = vloadd128a(src);
94 vstored128a(dst, vaddpd(s0, m20_m21));
95
96 i--;
97 dst++;
98 src++;
99 }
100 }
101 else {
102 while (i >= 4) {
103 D128 s0 = vloadd128u(src + 0);
104 D128 s1 = vloadd128u(src + 1);
105 D128 s2 = vloadd128u(src + 2);
106 D128 s3 = vloadd128u(src + 3);
107
108 vstored128u(dst + 0, vaddpd(s0, m20_m21));
109 vstored128u(dst + 1, vaddpd(s1, m20_m21));
110 vstored128u(dst + 2, vaddpd(s2, m20_m21));
111 vstored128u(dst + 3, vaddpd(s3, m20_m21));
112
113 i -= 4;
114 dst += 4;
115 src += 4;
116 }
117
118 while (i) {
119 D128 s0 = vloadd128u(src + 0);
120 vstored128u(dst + 0, vaddpd(s0, m20_m21));
121
122 i--;
123 dst++;
124 src++;
125 }
126 }
127
128 return BL_SUCCESS;
129}
130
131static BLResult BL_CDECL blMatrix2DMapPointDArrayScale_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept {
132 using namespace SIMD;
133
134 size_t i = size;
135 D128 m00_m11 = vsetd128(self->m11, self->m00);
136 D128 m20_m21 = vloadd128u(&self->m20);
137
138 if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) {
139 while (i >= 4) {
140 D128 s0 = vloadd128a(src + 0);
141 D128 s1 = vloadd128a(src + 1);
142 D128 s2 = vloadd128a(src + 2);
143 D128 s3 = vloadd128a(src + 3);
144
145 vstored128a(dst + 0, vaddpd(vmulpd(s0, m00_m11), m20_m21));
146 vstored128a(dst + 1, vaddpd(vmulpd(s1, m00_m11), m20_m21));
147 vstored128a(dst + 2, vaddpd(vmulpd(s2, m00_m11), m20_m21));
148 vstored128a(dst + 3, vaddpd(vmulpd(s3, m00_m11), m20_m21));
149
150 i -= 4;
151 dst += 4;
152 src += 4;
153 }
154
155 while (i) {
156 D128 s0 = vloadd128a(src);
157 vstored128a(dst, vaddpd(vmulpd(s0, m00_m11), m20_m21));
158
159 i--;
160 dst++;
161 src++;
162 }
163 }
164 else {
165 while (i >= 4) {
166 D128 s0 = vloadd128u(src + 0);
167 D128 s1 = vloadd128u(src + 1);
168 D128 s2 = vloadd128u(src + 2);
169 D128 s3 = vloadd128u(src + 3);
170
171 vstored128u(dst + 0, vaddpd(vmulpd(s0, m00_m11), m20_m21));
172 vstored128u(dst + 1, vaddpd(vmulpd(s1, m00_m11), m20_m21));
173 vstored128u(dst + 2, vaddpd(vmulpd(s2, m00_m11), m20_m21));
174 vstored128u(dst + 3, vaddpd(vmulpd(s3, m00_m11), m20_m21));
175
176 i -= 4;
177 dst += 4;
178 src += 4;
179 }
180
181 while (i) {
182 D128 s0 = vloadd128u(src);
183 vstored128u(dst, vaddpd(vmulpd(s0, m00_m11), m20_m21));
184
185 i--;
186 dst++;
187 src++;
188 }
189 }
190
191 return BL_SUCCESS;
192}
193
194static BLResult BL_CDECL blMatrix2DMapPointDArraySwap_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept {
195 using namespace SIMD;
196
197 D128 m01_m10 = vsetd128(self->m01, self->m10);
198 D128 m20_m21 = vloadd128u(&self->m20);
199
200 size_t i = size;
201 if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) {
202 while (i >= 4) {
203 D128 s0 = vloadd128a(src + 0);
204 D128 s1 = vloadd128a(src + 1);
205 D128 s2 = vloadd128a(src + 2);
206 D128 s3 = vloadd128a(src + 3);
207
208 s0 = vmulpd(vswapd64(s0), m01_m10);
209 s1 = vmulpd(vswapd64(s1), m01_m10);
210 s2 = vmulpd(vswapd64(s2), m01_m10);
211 s3 = vmulpd(vswapd64(s3), m01_m10);
212
213 vstored128a(dst + 0, vaddpd(s0, m20_m21));
214 vstored128a(dst + 1, vaddpd(s1, m20_m21));
215 vstored128a(dst + 2, vaddpd(s2, m20_m21));
216 vstored128a(dst + 3, vaddpd(s3, m20_m21));
217
218 i -= 4;
219 dst += 4;
220 src += 4;
221 }
222
223 while (i) {
224 D128 s0 = vloadd128a(src);
225 s0 = vmulpd(vswapd64(s0), m01_m10);
226 vstored128a(dst, vaddpd(s0, m20_m21));
227
228 i--;
229 dst++;
230 src++;
231 }
232 }
233 else {
234 while (i >= 4) {
235 D128 s0 = vloadd128u(src + 0);
236 D128 s1 = vloadd128u(src + 1);
237 D128 s2 = vloadd128u(src + 2);
238 D128 s3 = vloadd128u(src + 3);
239
240 s0 = vmulpd(vswapd64(s0), m01_m10);
241 s1 = vmulpd(vswapd64(s1), m01_m10);
242 s2 = vmulpd(vswapd64(s2), m01_m10);
243 s3 = vmulpd(vswapd64(s3), m01_m10);
244
245 vstored128u(dst + 0, vaddpd(s0, m20_m21));
246 vstored128u(dst + 1, vaddpd(s1, m20_m21));
247 vstored128u(dst + 2, vaddpd(s2, m20_m21));
248 vstored128u(dst + 3, vaddpd(s3, m20_m21));
249
250 i -= 4;
251 dst += 4;
252 src += 4;
253 }
254
255 while (i) {
256 D128 s0 = vloadd128u(src);
257 s0 = vmulpd(vswapd64(s0), m01_m10);
258 vstored128u(dst, vaddpd(s0, m20_m21));
259
260 i--;
261 dst++;
262 src++;
263 }
264 }
265
266 return BL_SUCCESS;
267}
268
269static BLResult BL_CDECL blMatrix2DMapPointDArrayAffine_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept {
270 using namespace SIMD;
271
272 size_t i = size;
273 D128 m00_m11 = vsetd128(self->m11, self->m00);
274 D128 m10_m01 = vsetd128(self->m01, self->m10);
275 D128 m20_m21 = vloadd128u(&self->m20);
276
277 if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) {
278 while (i >= 4) {
279 D128 s0 = vloadd128a(src + 0);
280 D128 s1 = vloadd128a(src + 1);
281 D128 s2 = vloadd128a(src + 2);
282 D128 s3 = vloadd128a(src + 3);
283
284 D128 r0 = vswapd64(s0);
285 D128 r1 = vswapd64(s1);
286 D128 r2 = vswapd64(s2);
287 D128 r3 = vswapd64(s3);
288
289 s0 = vmulpd(s0, m00_m11);
290 s1 = vmulpd(s1, m00_m11);
291 s2 = vmulpd(s2, m00_m11);
292 s3 = vmulpd(s3, m00_m11);
293
294 s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01));
295 s1 = vaddpd(vaddpd(s1, m20_m21), vmulpd(r1, m10_m01));
296 s2 = vaddpd(vaddpd(s2, m20_m21), vmulpd(r2, m10_m01));
297 s3 = vaddpd(vaddpd(s3, m20_m21), vmulpd(r3, m10_m01));
298
299 vstored128a(dst + 0, s0);
300 vstored128a(dst + 1, s1);
301 vstored128a(dst + 2, s2);
302 vstored128a(dst + 3, s3);
303
304 i -= 4;
305 dst += 4;
306 src += 4;
307 }
308
309 while (i) {
310 D128 s0 = vloadd128a(src);
311 D128 r0 = vswapd64(s0);
312
313 s0 = vmulpd(s0, m00_m11);
314 s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01));
315
316 vstored128a(dst, s0);
317
318 i--;
319 dst++;
320 src++;
321 }
322 }
323 else {
324 while (i >= 4) {
325 D128 s0 = vloadd128u(src + 0);
326 D128 s1 = vloadd128u(src + 1);
327 D128 s2 = vloadd128u(src + 2);
328 D128 s3 = vloadd128u(src + 3);
329
330 D128 r0 = vswapd64(s0);
331 D128 r1 = vswapd64(s1);
332 D128 r2 = vswapd64(s2);
333 D128 r3 = vswapd64(s3);
334
335 s0 = vmulpd(s0, m00_m11);
336 s1 = vmulpd(s1, m00_m11);
337 s2 = vmulpd(s2, m00_m11);
338 s3 = vmulpd(s3, m00_m11);
339
340 s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01));
341 s1 = vaddpd(vaddpd(s1, m20_m21), vmulpd(r1, m10_m01));
342 s2 = vaddpd(vaddpd(s2, m20_m21), vmulpd(r2, m10_m01));
343 s3 = vaddpd(vaddpd(s3, m20_m21), vmulpd(r3, m10_m01));
344
345 vstored128u(dst + 0, s0);
346 vstored128u(dst + 1, s1);
347 vstored128u(dst + 2, s2);
348 vstored128u(dst + 3, s3);
349
350 i -= 4;
351 dst += 4;
352 src += 4;
353 }
354
355 while (i) {
356 D128 s0 = vloadd128u(src);
357 D128 r0 = vswapd64(s0);
358
359 s0 = vmulpd(s0, m00_m11);
360 s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01));
361
362 vstored128u(dst, s0);
363
364 i--;
365 dst++;
366 src++;
367 }
368 }
369
370 return BL_SUCCESS;
371}
372
373// ============================================================================
374// [BLMatrix2D - Runtime Init [SSE2]]
375// ============================================================================
376
377BL_HIDDEN void blMatrix2DRtInit_SSE2(BLRuntimeContext* rt) noexcept {
378 BL_UNUSED(rt);
379 BLMapPointDArrayFunc* funcs = blMatrix2DMapPointDArrayFuncs;
380
381 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_IDENTITY ], blMatrix2DMapPointDArrayIdentity_SSE2);
382 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_TRANSLATE], blMatrix2DMapPointDArrayTranslate_SSE2);
383 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SCALE ], blMatrix2DMapPointDArrayScale_SSE2);
384 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SWAP ], blMatrix2DMapPointDArraySwap_SSE2);
385 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_AFFINE ], blMatrix2DMapPointDArrayAffine_SSE2);
386 blAssignFunc(&funcs[BL_MATRIX2D_TYPE_INVALID ], blMatrix2DMapPointDArrayAffine_SSE2);
387}
388
389#endif
390