1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_TARGET_OPT_SSE2 |
9 | |
10 | #include "./blgeometry.h" |
11 | #include "./blmatrix.h" |
12 | #include "./blruntime_p.h" |
13 | #include "./blsimd_p.h" |
14 | #include "./blsupport_p.h" |
15 | |
16 | // ============================================================================ |
17 | // [BLMatrix2D - MapPointDArray [SSE2]] |
18 | // ============================================================================ |
19 | |
20 | static BLResult BL_CDECL blMatrix2DMapPointDArrayIdentity_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
21 | using namespace SIMD; |
22 | |
23 | BL_UNUSED(self); |
24 | if (dst == src) |
25 | return BL_SUCCESS; |
26 | |
27 | size_t i = size; |
28 | if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) { |
29 | while (i >= 4) { |
30 | D128 s0 = vloadd128a(src + 0); |
31 | D128 s1 = vloadd128a(src + 1); |
32 | D128 s2 = vloadd128a(src + 2); |
33 | D128 s3 = vloadd128a(src + 3); |
34 | |
35 | vstored128a(dst + 0, s0); |
36 | vstored128a(dst + 1, s1); |
37 | vstored128a(dst + 2, s2); |
38 | vstored128a(dst + 3, s3); |
39 | |
40 | i -= 4; |
41 | dst += 4; |
42 | src += 4; |
43 | } |
44 | |
45 | while (i) { |
46 | vstored128a(dst, vloadd128a(src)); |
47 | i--; |
48 | dst++; |
49 | src++; |
50 | } |
51 | } |
52 | else { |
53 | while (i) { |
54 | D128 sx = vloadd128_64(&src->x); |
55 | D128 sy = vloadd128_64(&src->y); |
56 | |
57 | vstored64(&dst->x, sx); |
58 | vstored64(&dst->y, sy); |
59 | |
60 | i--; |
61 | dst++; |
62 | src++; |
63 | } |
64 | } |
65 | |
66 | return BL_SUCCESS; |
67 | } |
68 | |
69 | static BLResult BL_CDECL blMatrix2DMapPointDArrayTranslate_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
70 | using namespace SIMD; |
71 | |
72 | size_t i = size; |
73 | D128 m20_m21 = vloadd128u(&self->m20); |
74 | |
75 | if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) { |
76 | while (i >= 4) { |
77 | D128 s0 = vloadd128a(src + 0); |
78 | D128 s1 = vloadd128a(src + 1); |
79 | D128 s2 = vloadd128a(src + 2); |
80 | D128 s3 = vloadd128a(src + 3); |
81 | |
82 | vstored128a(dst + 0, vaddpd(s0, m20_m21)); |
83 | vstored128a(dst + 1, vaddpd(s1, m20_m21)); |
84 | vstored128a(dst + 2, vaddpd(s2, m20_m21)); |
85 | vstored128a(dst + 3, vaddpd(s3, m20_m21)); |
86 | |
87 | i -= 4; |
88 | dst += 4; |
89 | src += 4; |
90 | } |
91 | |
92 | while (i) { |
93 | D128 s0 = vloadd128a(src); |
94 | vstored128a(dst, vaddpd(s0, m20_m21)); |
95 | |
96 | i--; |
97 | dst++; |
98 | src++; |
99 | } |
100 | } |
101 | else { |
102 | while (i >= 4) { |
103 | D128 s0 = vloadd128u(src + 0); |
104 | D128 s1 = vloadd128u(src + 1); |
105 | D128 s2 = vloadd128u(src + 2); |
106 | D128 s3 = vloadd128u(src + 3); |
107 | |
108 | vstored128u(dst + 0, vaddpd(s0, m20_m21)); |
109 | vstored128u(dst + 1, vaddpd(s1, m20_m21)); |
110 | vstored128u(dst + 2, vaddpd(s2, m20_m21)); |
111 | vstored128u(dst + 3, vaddpd(s3, m20_m21)); |
112 | |
113 | i -= 4; |
114 | dst += 4; |
115 | src += 4; |
116 | } |
117 | |
118 | while (i) { |
119 | D128 s0 = vloadd128u(src + 0); |
120 | vstored128u(dst + 0, vaddpd(s0, m20_m21)); |
121 | |
122 | i--; |
123 | dst++; |
124 | src++; |
125 | } |
126 | } |
127 | |
128 | return BL_SUCCESS; |
129 | } |
130 | |
131 | static BLResult BL_CDECL blMatrix2DMapPointDArrayScale_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
132 | using namespace SIMD; |
133 | |
134 | size_t i = size; |
135 | D128 m00_m11 = vsetd128(self->m11, self->m00); |
136 | D128 m20_m21 = vloadd128u(&self->m20); |
137 | |
138 | if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) { |
139 | while (i >= 4) { |
140 | D128 s0 = vloadd128a(src + 0); |
141 | D128 s1 = vloadd128a(src + 1); |
142 | D128 s2 = vloadd128a(src + 2); |
143 | D128 s3 = vloadd128a(src + 3); |
144 | |
145 | vstored128a(dst + 0, vaddpd(vmulpd(s0, m00_m11), m20_m21)); |
146 | vstored128a(dst + 1, vaddpd(vmulpd(s1, m00_m11), m20_m21)); |
147 | vstored128a(dst + 2, vaddpd(vmulpd(s2, m00_m11), m20_m21)); |
148 | vstored128a(dst + 3, vaddpd(vmulpd(s3, m00_m11), m20_m21)); |
149 | |
150 | i -= 4; |
151 | dst += 4; |
152 | src += 4; |
153 | } |
154 | |
155 | while (i) { |
156 | D128 s0 = vloadd128a(src); |
157 | vstored128a(dst, vaddpd(vmulpd(s0, m00_m11), m20_m21)); |
158 | |
159 | i--; |
160 | dst++; |
161 | src++; |
162 | } |
163 | } |
164 | else { |
165 | while (i >= 4) { |
166 | D128 s0 = vloadd128u(src + 0); |
167 | D128 s1 = vloadd128u(src + 1); |
168 | D128 s2 = vloadd128u(src + 2); |
169 | D128 s3 = vloadd128u(src + 3); |
170 | |
171 | vstored128u(dst + 0, vaddpd(vmulpd(s0, m00_m11), m20_m21)); |
172 | vstored128u(dst + 1, vaddpd(vmulpd(s1, m00_m11), m20_m21)); |
173 | vstored128u(dst + 2, vaddpd(vmulpd(s2, m00_m11), m20_m21)); |
174 | vstored128u(dst + 3, vaddpd(vmulpd(s3, m00_m11), m20_m21)); |
175 | |
176 | i -= 4; |
177 | dst += 4; |
178 | src += 4; |
179 | } |
180 | |
181 | while (i) { |
182 | D128 s0 = vloadd128u(src); |
183 | vstored128u(dst, vaddpd(vmulpd(s0, m00_m11), m20_m21)); |
184 | |
185 | i--; |
186 | dst++; |
187 | src++; |
188 | } |
189 | } |
190 | |
191 | return BL_SUCCESS; |
192 | } |
193 | |
194 | static BLResult BL_CDECL blMatrix2DMapPointDArraySwap_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
195 | using namespace SIMD; |
196 | |
197 | D128 m01_m10 = vsetd128(self->m01, self->m10); |
198 | D128 m20_m21 = vloadd128u(&self->m20); |
199 | |
200 | size_t i = size; |
201 | if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) { |
202 | while (i >= 4) { |
203 | D128 s0 = vloadd128a(src + 0); |
204 | D128 s1 = vloadd128a(src + 1); |
205 | D128 s2 = vloadd128a(src + 2); |
206 | D128 s3 = vloadd128a(src + 3); |
207 | |
208 | s0 = vmulpd(vswapd64(s0), m01_m10); |
209 | s1 = vmulpd(vswapd64(s1), m01_m10); |
210 | s2 = vmulpd(vswapd64(s2), m01_m10); |
211 | s3 = vmulpd(vswapd64(s3), m01_m10); |
212 | |
213 | vstored128a(dst + 0, vaddpd(s0, m20_m21)); |
214 | vstored128a(dst + 1, vaddpd(s1, m20_m21)); |
215 | vstored128a(dst + 2, vaddpd(s2, m20_m21)); |
216 | vstored128a(dst + 3, vaddpd(s3, m20_m21)); |
217 | |
218 | i -= 4; |
219 | dst += 4; |
220 | src += 4; |
221 | } |
222 | |
223 | while (i) { |
224 | D128 s0 = vloadd128a(src); |
225 | s0 = vmulpd(vswapd64(s0), m01_m10); |
226 | vstored128a(dst, vaddpd(s0, m20_m21)); |
227 | |
228 | i--; |
229 | dst++; |
230 | src++; |
231 | } |
232 | } |
233 | else { |
234 | while (i >= 4) { |
235 | D128 s0 = vloadd128u(src + 0); |
236 | D128 s1 = vloadd128u(src + 1); |
237 | D128 s2 = vloadd128u(src + 2); |
238 | D128 s3 = vloadd128u(src + 3); |
239 | |
240 | s0 = vmulpd(vswapd64(s0), m01_m10); |
241 | s1 = vmulpd(vswapd64(s1), m01_m10); |
242 | s2 = vmulpd(vswapd64(s2), m01_m10); |
243 | s3 = vmulpd(vswapd64(s3), m01_m10); |
244 | |
245 | vstored128u(dst + 0, vaddpd(s0, m20_m21)); |
246 | vstored128u(dst + 1, vaddpd(s1, m20_m21)); |
247 | vstored128u(dst + 2, vaddpd(s2, m20_m21)); |
248 | vstored128u(dst + 3, vaddpd(s3, m20_m21)); |
249 | |
250 | i -= 4; |
251 | dst += 4; |
252 | src += 4; |
253 | } |
254 | |
255 | while (i) { |
256 | D128 s0 = vloadd128u(src); |
257 | s0 = vmulpd(vswapd64(s0), m01_m10); |
258 | vstored128u(dst, vaddpd(s0, m20_m21)); |
259 | |
260 | i--; |
261 | dst++; |
262 | src++; |
263 | } |
264 | } |
265 | |
266 | return BL_SUCCESS; |
267 | } |
268 | |
269 | static BLResult BL_CDECL blMatrix2DMapPointDArrayAffine_SSE2(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
270 | using namespace SIMD; |
271 | |
272 | size_t i = size; |
273 | D128 m00_m11 = vsetd128(self->m11, self->m00); |
274 | D128 m10_m01 = vsetd128(self->m01, self->m10); |
275 | D128 m20_m21 = vloadd128u(&self->m20); |
276 | |
277 | if (blIsAligned(((uintptr_t)dst | (uintptr_t)src), 16)) { |
278 | while (i >= 4) { |
279 | D128 s0 = vloadd128a(src + 0); |
280 | D128 s1 = vloadd128a(src + 1); |
281 | D128 s2 = vloadd128a(src + 2); |
282 | D128 s3 = vloadd128a(src + 3); |
283 | |
284 | D128 r0 = vswapd64(s0); |
285 | D128 r1 = vswapd64(s1); |
286 | D128 r2 = vswapd64(s2); |
287 | D128 r3 = vswapd64(s3); |
288 | |
289 | s0 = vmulpd(s0, m00_m11); |
290 | s1 = vmulpd(s1, m00_m11); |
291 | s2 = vmulpd(s2, m00_m11); |
292 | s3 = vmulpd(s3, m00_m11); |
293 | |
294 | s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01)); |
295 | s1 = vaddpd(vaddpd(s1, m20_m21), vmulpd(r1, m10_m01)); |
296 | s2 = vaddpd(vaddpd(s2, m20_m21), vmulpd(r2, m10_m01)); |
297 | s3 = vaddpd(vaddpd(s3, m20_m21), vmulpd(r3, m10_m01)); |
298 | |
299 | vstored128a(dst + 0, s0); |
300 | vstored128a(dst + 1, s1); |
301 | vstored128a(dst + 2, s2); |
302 | vstored128a(dst + 3, s3); |
303 | |
304 | i -= 4; |
305 | dst += 4; |
306 | src += 4; |
307 | } |
308 | |
309 | while (i) { |
310 | D128 s0 = vloadd128a(src); |
311 | D128 r0 = vswapd64(s0); |
312 | |
313 | s0 = vmulpd(s0, m00_m11); |
314 | s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01)); |
315 | |
316 | vstored128a(dst, s0); |
317 | |
318 | i--; |
319 | dst++; |
320 | src++; |
321 | } |
322 | } |
323 | else { |
324 | while (i >= 4) { |
325 | D128 s0 = vloadd128u(src + 0); |
326 | D128 s1 = vloadd128u(src + 1); |
327 | D128 s2 = vloadd128u(src + 2); |
328 | D128 s3 = vloadd128u(src + 3); |
329 | |
330 | D128 r0 = vswapd64(s0); |
331 | D128 r1 = vswapd64(s1); |
332 | D128 r2 = vswapd64(s2); |
333 | D128 r3 = vswapd64(s3); |
334 | |
335 | s0 = vmulpd(s0, m00_m11); |
336 | s1 = vmulpd(s1, m00_m11); |
337 | s2 = vmulpd(s2, m00_m11); |
338 | s3 = vmulpd(s3, m00_m11); |
339 | |
340 | s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01)); |
341 | s1 = vaddpd(vaddpd(s1, m20_m21), vmulpd(r1, m10_m01)); |
342 | s2 = vaddpd(vaddpd(s2, m20_m21), vmulpd(r2, m10_m01)); |
343 | s3 = vaddpd(vaddpd(s3, m20_m21), vmulpd(r3, m10_m01)); |
344 | |
345 | vstored128u(dst + 0, s0); |
346 | vstored128u(dst + 1, s1); |
347 | vstored128u(dst + 2, s2); |
348 | vstored128u(dst + 3, s3); |
349 | |
350 | i -= 4; |
351 | dst += 4; |
352 | src += 4; |
353 | } |
354 | |
355 | while (i) { |
356 | D128 s0 = vloadd128u(src); |
357 | D128 r0 = vswapd64(s0); |
358 | |
359 | s0 = vmulpd(s0, m00_m11); |
360 | s0 = vaddpd(vaddpd(s0, m20_m21), vmulpd(r0, m10_m01)); |
361 | |
362 | vstored128u(dst, s0); |
363 | |
364 | i--; |
365 | dst++; |
366 | src++; |
367 | } |
368 | } |
369 | |
370 | return BL_SUCCESS; |
371 | } |
372 | |
373 | // ============================================================================ |
374 | // [BLMatrix2D - Runtime Init [SSE2]] |
375 | // ============================================================================ |
376 | |
377 | BL_HIDDEN void blMatrix2DRtInit_SSE2(BLRuntimeContext* rt) noexcept { |
378 | BL_UNUSED(rt); |
379 | BLMapPointDArrayFunc* funcs = blMatrix2DMapPointDArrayFuncs; |
380 | |
381 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_IDENTITY ], blMatrix2DMapPointDArrayIdentity_SSE2); |
382 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_TRANSLATE], blMatrix2DMapPointDArrayTranslate_SSE2); |
383 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SCALE ], blMatrix2DMapPointDArrayScale_SSE2); |
384 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SWAP ], blMatrix2DMapPointDArraySwap_SSE2); |
385 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_AFFINE ], blMatrix2DMapPointDArrayAffine_SSE2); |
386 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_INVALID ], blMatrix2DMapPointDArrayAffine_SSE2); |
387 | } |
388 | |
389 | #endif |
390 | |