1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_TARGET_OPT_AVX |
9 | |
10 | #include "./blgeometry.h" |
11 | #include "./blmatrix.h" |
12 | #include "./blruntime_p.h" |
13 | #include "./blsimd_p.h" |
14 | #include "./blsupport_p.h" |
15 | |
16 | // ============================================================================ |
17 | // [BLMatrix2D - MapPointDArray [AVX]] |
18 | // ============================================================================ |
19 | |
20 | static BLResult BL_CDECL blMatrix2DMapPointDArrayIdentity_AVX(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
21 | using namespace SIMD; |
22 | |
23 | BL_UNUSED(self); |
24 | if (dst == src) |
25 | return BL_SUCCESS; |
26 | |
27 | size_t i = size; |
28 | while (i >= 8) { |
29 | vstored256u(dst + 0, vloadd256u(src + 0)); |
30 | vstored256u(dst + 2, vloadd256u(src + 2)); |
31 | vstored256u(dst + 4, vloadd256u(src + 4)); |
32 | vstored256u(dst + 6, vloadd256u(src + 6)); |
33 | |
34 | i -= 8; |
35 | dst += 8; |
36 | src += 8; |
37 | } |
38 | |
39 | while (i >= 2) { |
40 | vstored256u(dst, vloadd256u(src)); |
41 | |
42 | i -= 2; |
43 | dst += 2; |
44 | src += 2; |
45 | } |
46 | |
47 | if (i) |
48 | vstored128u(dst, vloadd128u(src)); |
49 | |
50 | return BL_SUCCESS; |
51 | } |
52 | |
53 | static BLResult BL_CDECL blMatrix2DMapPointDArrayTranslate_AVX(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
54 | using namespace SIMD; |
55 | |
56 | size_t i = size; |
57 | D256 m20_m21 = vbroadcastd256_128(&self->m20); |
58 | |
59 | while (i >= 8) { |
60 | vstored256u(dst + 0, vaddpd(vloadd256u(src + 0), m20_m21)); |
61 | vstored256u(dst + 2, vaddpd(vloadd256u(src + 2), m20_m21)); |
62 | vstored256u(dst + 4, vaddpd(vloadd256u(src + 4), m20_m21)); |
63 | vstored256u(dst + 6, vaddpd(vloadd256u(src + 6), m20_m21)); |
64 | |
65 | i -= 8; |
66 | dst += 8; |
67 | src += 8; |
68 | } |
69 | |
70 | while (i >= 2) { |
71 | vstored256u(dst, vaddpd(vloadd256u(src), m20_m21)); |
72 | |
73 | i -= 2; |
74 | dst += 2; |
75 | src += 2; |
76 | } |
77 | |
78 | if (i) |
79 | vstored128u(dst, vaddpd(vloadd128u(src), vcast<D128>(m20_m21))); |
80 | |
81 | return BL_SUCCESS; |
82 | } |
83 | |
84 | static BLResult BL_CDECL blMatrix2DMapPointDArrayScale_AVX(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
85 | using namespace SIMD; |
86 | |
87 | size_t i = size; |
88 | D256 m00_m11 = vdupld128(vsetd128(self->m11, self->m00)); |
89 | D256 m20_m21 = vbroadcastd256_128(&self->m20); |
90 | |
91 | while (i >= 8) { |
92 | vstored256u(dst + 0, vaddpd(vmulpd(vloadd256u(src + 0), m00_m11), m20_m21)); |
93 | vstored256u(dst + 2, vaddpd(vmulpd(vloadd256u(src + 2), m00_m11), m20_m21)); |
94 | vstored256u(dst + 4, vaddpd(vmulpd(vloadd256u(src + 4), m00_m11), m20_m21)); |
95 | vstored256u(dst + 6, vaddpd(vmulpd(vloadd256u(src + 6), m00_m11), m20_m21)); |
96 | |
97 | i -= 8; |
98 | dst += 8; |
99 | src += 8; |
100 | } |
101 | |
102 | while (i >= 2) { |
103 | vstored256u(dst, vaddpd(vmulpd(vloadd256u(src), m00_m11), m20_m21)); |
104 | |
105 | i -= 2; |
106 | dst += 2; |
107 | src += 2; |
108 | } |
109 | |
110 | if (i) |
111 | vstored128u(dst, vaddpd(vmulpd(vloadd128u(src), vcast<D128>(m00_m11)), vcast<D128>(m20_m21))); |
112 | |
113 | return BL_SUCCESS; |
114 | } |
115 | |
116 | static BLResult BL_CDECL blMatrix2DMapPointDArraySwap_AVX(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
117 | using namespace SIMD; |
118 | |
119 | size_t i = size; |
120 | D256 m01_m10 = vdupld128(vsetd128(self->m01, self->m10)); |
121 | D256 m20_m21 = vbroadcastd256_128(&self->m20); |
122 | |
123 | while (i >= 8) { |
124 | vstored256u(dst + 0, vaddpd(vmulpd(vswapd64(vloadd256u(src + 0)), m01_m10), m20_m21)); |
125 | vstored256u(dst + 2, vaddpd(vmulpd(vswapd64(vloadd256u(src + 2)), m01_m10), m20_m21)); |
126 | vstored256u(dst + 4, vaddpd(vmulpd(vswapd64(vloadd256u(src + 4)), m01_m10), m20_m21)); |
127 | vstored256u(dst + 6, vaddpd(vmulpd(vswapd64(vloadd256u(src + 6)), m01_m10), m20_m21)); |
128 | |
129 | i -= 8; |
130 | dst += 8; |
131 | src += 8; |
132 | } |
133 | |
134 | while (i >= 2) { |
135 | vstored256u(dst, vaddpd(vmulpd(vswapd64(vloadd256u(src)), m01_m10), m20_m21)); |
136 | |
137 | i -= 2; |
138 | dst += 2; |
139 | src += 2; |
140 | } |
141 | |
142 | if (i) |
143 | vstored128u(dst, vaddpd(vmulpd(vswapd64(vloadd128u(src)), vcast<D128>(m01_m10)), vcast<D128>(m20_m21))); |
144 | |
145 | return BL_SUCCESS; |
146 | } |
147 | |
148 | static BLResult BL_CDECL blMatrix2DMapPointDArrayAffine_AVX(const BLMatrix2D* self, BLPoint* dst, const BLPoint* src, size_t size) noexcept { |
149 | using namespace SIMD; |
150 | |
151 | size_t i = size; |
152 | D256 m00_m11 = vdupld128(vsetd128(self->m11, self->m00)); |
153 | D256 m10_m01 = vdupld128(vsetd128(self->m01, self->m10)); |
154 | D256 m20_m21 = vbroadcastd256_128(&self->m20); |
155 | |
156 | while (i >= 8) { |
157 | D256 s0 = vloadd256u(src + 0); |
158 | D256 s1 = vloadd256u(src + 2); |
159 | D256 s2 = vloadd256u(src + 4); |
160 | D256 s3 = vloadd256u(src + 6); |
161 | |
162 | vstored256u(dst + 0, vaddpd(vaddpd(vmulpd(s0, m00_m11), m20_m21), vmulpd(vswapd64(s0), m10_m01))); |
163 | vstored256u(dst + 2, vaddpd(vaddpd(vmulpd(s1, m00_m11), m20_m21), vmulpd(vswapd64(s1), m10_m01))); |
164 | vstored256u(dst + 4, vaddpd(vaddpd(vmulpd(s2, m00_m11), m20_m21), vmulpd(vswapd64(s2), m10_m01))); |
165 | vstored256u(dst + 6, vaddpd(vaddpd(vmulpd(s3, m00_m11), m20_m21), vmulpd(vswapd64(s3), m10_m01))); |
166 | |
167 | i -= 8; |
168 | dst += 8; |
169 | src += 8; |
170 | } |
171 | |
172 | while (i >= 2) { |
173 | D256 s0 = vloadd256u(src); |
174 | vstored256u(dst, vaddpd(vaddpd(vmulpd(s0, m00_m11), m20_m21), vmulpd(vswapd64(s0), m10_m01))); |
175 | |
176 | i -= 2; |
177 | dst += 2; |
178 | src += 2; |
179 | } |
180 | |
181 | if (i) { |
182 | D128 s0 = vloadd128u(src); |
183 | vstored128u(dst, vaddpd(vaddpd(vmulpd(s0, vcast<D128>(m00_m11)), vcast<D128>(m20_m21)), vmulpd(vswapd64(s0), vcast<D128>(m10_m01)))); |
184 | } |
185 | |
186 | return BL_SUCCESS; |
187 | } |
188 | |
189 | // ============================================================================ |
190 | // [BLMatrix2D - Runtime Init [AVX]] |
191 | // ============================================================================ |
192 | |
193 | BL_HIDDEN void blMatrix2DRtInit_AVX(BLRuntimeContext* rt) noexcept { |
194 | BL_UNUSED(rt); |
195 | BLMapPointDArrayFunc* funcs = blMatrix2DMapPointDArrayFuncs; |
196 | |
197 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_IDENTITY ], blMatrix2DMapPointDArrayIdentity_AVX); |
198 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_TRANSLATE], blMatrix2DMapPointDArrayTranslate_AVX); |
199 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SCALE ], blMatrix2DMapPointDArrayScale_AVX); |
200 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_SWAP ], blMatrix2DMapPointDArraySwap_AVX); |
201 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_AFFINE ], blMatrix2DMapPointDArrayAffine_AVX); |
202 | blAssignFunc(&funcs[BL_MATRIX2D_TYPE_INVALID ], blMatrix2DMapPointDArrayAffine_AVX); |
203 | } |
204 | |
205 | #endif |
206 | |