1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_BUILD_OPT_SSSE3 |
9 | |
10 | #include "./blpixelconverter_p.h" |
11 | #include "./blsimd_p.h" |
12 | #include "./blsupport_p.h" |
13 | |
14 | // ============================================================================ |
15 | // [BLPixelConverter - PRGB32 <- RGB24 (SSSE3)] |
16 | // ============================================================================ |
17 | |
18 | static BLResult BL_CDECL bl_convert_prgb32_from_rgb24_ssse3( |
19 | const BLPixelConverterCore* self, |
20 | uint8_t* dstData, intptr_t dstStride, |
21 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
22 | |
23 | using namespace SIMD; |
24 | if (!options) |
25 | options = &blPixelConverterDefaultOptions; |
26 | |
27 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
28 | size_t gap = options->gap; |
29 | |
30 | dstStride -= (w * 4) + gap; |
31 | srcStride -= (w * 3); |
32 | |
33 | I128 fillMask = vseti128u32(d.fillMask); |
34 | I128 predicate = vloadi128u(d.simdData); |
35 | |
36 | for (uint32_t y = h; y != 0; y--) { |
37 | uint32_t i = w; |
38 | |
39 | while (i >= 16) { |
40 | I128 p0, p1, p2, p3; |
41 | |
42 | p0 = vloadi128u(srcData + 0); // [x5|z4 y4 x4|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0] |
43 | p1 = vloadi128u(srcData + 16); // [yA|xA|z9 y9|x9 z8 y8 x8|z7 y7 x7 z6|y6 x6 z5 y5] |
44 | p3 = vloadi128u(srcData + 32); // [zF yF xF zE|yE xE zD yD|xD zC yC xC|zB yB xB zA] |
45 | |
46 | p2 = vpalignr<8>(p3, p1); // [-- -- -- --|zB yB xB zA|yA|xA|z9 y9|x9 z8 y8 x8] |
47 | p1 = vpalignr<12>(p1, p0); // [-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5|x5|z4 y4 x4] |
48 | p3 = vsrli128b<4>(p3); // [-- -- -- --|zF yF xF zE|yE xE zD yD|xD zC yC xC] |
49 | |
50 | p0 = vpshufb(p0, predicate); |
51 | p1 = vpshufb(p1, predicate); |
52 | p2 = vpshufb(p2, predicate); |
53 | p3 = vpshufb(p3, predicate); |
54 | |
55 | p0 = vor(p0, fillMask); |
56 | p1 = vor(p1, fillMask); |
57 | p2 = vor(p2, fillMask); |
58 | p3 = vor(p3, fillMask); |
59 | |
60 | vstorei128u(dstData + 0, p0); |
61 | vstorei128u(dstData + 16, p1); |
62 | vstorei128u(dstData + 32, p2); |
63 | vstorei128u(dstData + 48, p3); |
64 | |
65 | dstData += 64; |
66 | srcData += 48; |
67 | i -= 16; |
68 | } |
69 | |
70 | if (i >= 8) { |
71 | I128 p0, p1; |
72 | |
73 | p0 = vloadi128u (srcData + 0); // [x5|z4 y4 x4|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0] |
74 | p1 = vloadi128_64(srcData + 16); // [-- -- -- --|-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5] |
75 | |
76 | p1 = vpalignr<12>(p1, p0); // [-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5|x5|z4 y4 x4] |
77 | |
78 | p0 = vpshufb(p0, predicate); |
79 | p1 = vpshufb(p1, predicate); |
80 | |
81 | p0 = vor(p0, fillMask); |
82 | p1 = vor(p1, fillMask); |
83 | |
84 | vstorei128u(dstData + 0, p0); |
85 | vstorei128u(dstData + 16, p1); |
86 | |
87 | dstData += 32; |
88 | srcData += 24; |
89 | i -= 8; |
90 | } |
91 | |
92 | if (i >= 4) { |
93 | I128 p0, p1; |
94 | |
95 | p0 = vloadi128_64(srcData + 0); // [-- -- -- --|-- -- -- --|y2 x2 z1 y1|x1 z0 y0 x0] |
96 | p1 = vloadi128_32(srcData + 8); // [-- -- -- --|-- -- -- --|-- -- -- --|z3 y3 x3 z2] |
97 | |
98 | p0 = vunpackli64(p0, p1); // [-- -- -- --|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0] |
99 | p0 = vpshufb(p0, predicate); |
100 | p0 = vor(p0, fillMask); |
101 | |
102 | vstorei128u(dstData + 0, p0); |
103 | |
104 | dstData += 16; |
105 | srcData += 12; |
106 | i -= 4; |
107 | } |
108 | |
109 | while (i) { |
110 | uint32_t yx = blMemReadU16u(srcData + 0); |
111 | uint32_t z = blMemReadU8(srcData + 2); |
112 | |
113 | I128 p0 = vcvtu32i128((z << 16) | yx); |
114 | p0 = vpshufb(p0, predicate); |
115 | p0 = vor(p0, fillMask); |
116 | vstorei32(dstData, p0); |
117 | |
118 | dstData += 4; |
119 | srcData += 3; |
120 | i--; |
121 | } |
122 | |
123 | dstData = blPixelConverterFillGap(dstData, gap); |
124 | dstData += dstStride; |
125 | srcData += srcStride; |
126 | } |
127 | |
128 | return BL_SUCCESS; |
129 | } |
130 | |
131 | // ============================================================================ |
132 | // [BLPixelConverter - PRGB32 <- XRGB32 (SSSE3)] |
133 | // ============================================================================ |
134 | |
135 | static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_ssse3( |
136 | const BLPixelConverterCore* self, |
137 | uint8_t* dstData, intptr_t dstStride, |
138 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
139 | |
140 | using namespace SIMD; |
141 | if (!options) |
142 | options = &blPixelConverterDefaultOptions; |
143 | |
144 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
145 | size_t gap = options->gap; |
146 | |
147 | dstStride -= (w * 4) + gap; |
148 | srcStride -= (w * 4); |
149 | |
150 | using namespace SIMD; |
151 | I128 fillMask = vseti128u32(d.fillMask); |
152 | I128 predicate = vloadi128u(d.simdData); |
153 | |
154 | for (uint32_t y = h; y != 0; y--) { |
155 | uint32_t i = w; |
156 | |
157 | while (i >= 16) { |
158 | I128 p0, p1, p2, p3; |
159 | |
160 | p0 = vloadi128u(srcData + 0); |
161 | p1 = vloadi128u(srcData + 16); |
162 | p2 = vloadi128u(srcData + 32); |
163 | p3 = vloadi128u(srcData + 48); |
164 | |
165 | p0 = vpshufb(p0, predicate); |
166 | p1 = vpshufb(p1, predicate); |
167 | p2 = vpshufb(p2, predicate); |
168 | p3 = vpshufb(p3, predicate); |
169 | |
170 | p0 = vor(p0, fillMask); |
171 | p1 = vor(p1, fillMask); |
172 | p2 = vor(p2, fillMask); |
173 | p3 = vor(p3, fillMask); |
174 | |
175 | vstorei128u(dstData + 0, p0); |
176 | vstorei128u(dstData + 16, p1); |
177 | vstorei128u(dstData + 32, p2); |
178 | vstorei128u(dstData + 48, p3); |
179 | |
180 | dstData += 64; |
181 | srcData += 64; |
182 | i -= 16; |
183 | } |
184 | |
185 | while (i >= 4) { |
186 | I128 p0; |
187 | |
188 | p0 = vloadi128u(srcData); |
189 | p0 = vpshufb(p0, predicate); |
190 | p0 = vor(p0, fillMask); |
191 | vstorei128u(dstData, p0); |
192 | |
193 | dstData += 16; |
194 | srcData += 16; |
195 | i -= 4; |
196 | } |
197 | |
198 | while (i) { |
199 | I128 p0; |
200 | |
201 | p0 = vloadi128_32(srcData); |
202 | p0 = vpshufb(p0, predicate); |
203 | p0 = vor(p0, fillMask); |
204 | vstorei32(dstData, p0); |
205 | |
206 | dstData += 4; |
207 | srcData += 4; |
208 | i--; |
209 | } |
210 | |
211 | dstData = blPixelConverterFillGap(dstData, gap); |
212 | dstData += dstStride; |
213 | srcData += srcStride; |
214 | } |
215 | |
216 | return BL_SUCCESS; |
217 | } |
218 | |
219 | // ============================================================================ |
220 | // [BLPixelConverter - PRGB32 <- ARGB32 (SSSE3)] |
221 | // ============================================================================ |
222 | |
223 | static BLResult BL_CDECL bl_convert_prgb32_from_argb32_ssse3( |
224 | const BLPixelConverterCore* self, |
225 | uint8_t* dstData, intptr_t dstStride, |
226 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
227 | |
228 | using namespace SIMD; |
229 | if (!options) |
230 | options = &blPixelConverterDefaultOptions; |
231 | |
232 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
233 | size_t gap = options->gap; |
234 | |
235 | dstStride -= (w * 4) + gap; |
236 | srcStride -= (w * 4); |
237 | |
238 | I128 zero = vzeroi128(); |
239 | I128 a255 = vseti128u64(0x00FF000000000000u); |
240 | I128 fillMask = vseti128u32(d.fillMask); |
241 | I128 predicate = vloadi128u(d.simdData); |
242 | |
243 | for (uint32_t y = h; y != 0; y--) { |
244 | uint32_t i = w; |
245 | |
246 | while (i >= 4) { |
247 | I128 p0, p1; |
248 | I128 a0, a1; |
249 | |
250 | p0 = vloadi128u(srcData); |
251 | p0 = vpshufb(p0, predicate); |
252 | |
253 | p1 = vunpackhi8(p0, zero); |
254 | p0 = vunpackli8(p0, zero); |
255 | |
256 | a1 = vswizi16<3, 3, 3, 3>(p1); |
257 | p1 = vor(p1, a255); |
258 | |
259 | a0 = vswizi16<3, 3, 3, 3>(p0); |
260 | p0 = vor(p0, a255); |
261 | |
262 | p1 = vdiv255u16(vmuli16(p1, a1)); |
263 | p0 = vdiv255u16(vmuli16(p0, a0)); |
264 | p0 = vpacki16u8(p0, p1); |
265 | p0 = vor(p0, fillMask); |
266 | vstorei128u(dstData, p0); |
267 | |
268 | dstData += 16; |
269 | srcData += 16; |
270 | i -= 4; |
271 | } |
272 | |
273 | while (i) { |
274 | I128 p0; |
275 | I128 a0; |
276 | |
277 | p0 = vloadi128_32(srcData); |
278 | p0 = vpshufb(p0, predicate); |
279 | p0 = vunpackli8(p0, zero); |
280 | a0 = vswizi16<3, 3, 3, 3>(p0); |
281 | p0 = vor(p0, a255); |
282 | p0 = vdiv255u16(vmuli16(p0, a0)); |
283 | p0 = vpacki16u8(p0, p0); |
284 | p0 = vor(p0, fillMask); |
285 | vstorei32(dstData, p0); |
286 | |
287 | dstData += 4; |
288 | srcData += 4; |
289 | i--; |
290 | } |
291 | |
292 | dstData = blPixelConverterFillGap(dstData, gap); |
293 | dstData += dstStride; |
294 | srcData += srcStride; |
295 | } |
296 | |
297 | return BL_SUCCESS; |
298 | } |
299 | |
300 | // ============================================================================ |
301 | // [BLPixelConverter - Init (SSSE3)] |
302 | // ============================================================================ |
303 | |
304 | static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate24(const BLPixelConverterData::NativeFromExternal& d) noexcept { |
305 | uint32_t aIndex = 0x80u; |
306 | uint32_t rIndex = uint32_t(d.shifts[0]) >> 3; |
307 | uint32_t gIndex = uint32_t(d.shifts[1]) >> 3; |
308 | uint32_t bIndex = uint32_t(d.shifts[2]) >> 3; |
309 | |
310 | return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24); |
311 | } |
312 | |
313 | static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate32(const BLPixelConverterData::NativeFromExternal& d) noexcept { |
314 | uint32_t rIndex = uint32_t(d.shifts[0]) >> 3; |
315 | uint32_t gIndex = uint32_t(d.shifts[1]) >> 3; |
316 | uint32_t bIndex = uint32_t(d.shifts[2]) >> 3; |
317 | uint32_t aIndex = uint32_t(d.shifts[3]) >> 3; |
318 | |
319 | return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24); |
320 | } |
321 | |
322 | bool blPixelConverterInitNativeFromXRGB_SSSE3(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept { |
323 | BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
324 | if (srcInfo.depth == 24) { |
325 | // Only BYTE aligned components (888 format). |
326 | if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED)) |
327 | return false; |
328 | |
329 | // We expect RGB components in any order, but not alpha. |
330 | if (d.masks[0] != 0) |
331 | return false; |
332 | |
333 | switch (dstFormat) { |
334 | case BL_FORMAT_XRGB32: |
335 | case BL_FORMAT_PRGB32: |
336 | d.simdData[0] = blPixelConverterMakePshufbPredicate24(d); |
337 | d.simdData[1] = d.simdData[0] + 0x00030303u; |
338 | d.simdData[2] = d.simdData[0] + 0x00060606u; |
339 | d.simdData[3] = d.simdData[0] + 0x00090909u; |
340 | |
341 | self->convertFunc = bl_convert_prgb32_from_rgb24_ssse3; |
342 | return true; |
343 | |
344 | default: |
345 | return false; |
346 | } |
347 | } |
348 | else if (srcInfo.depth == 32) { |
349 | // Only BYTE aligned components (8888 or X888 formats). |
350 | if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED)) |
351 | return false; |
352 | |
353 | // This combination is provided by SSE2 converter and doesn't use PSHUFB. |
354 | // It's better on machines where PSHUFB is slow like ATOMs. |
355 | if (d.shifts[0] == 16 && d.shifts[1] == 8 && d.shifts[2] == 0) |
356 | return false; |
357 | |
358 | bool isARGB = (srcInfo.flags & BL_FORMAT_FLAG_ALPHA) != 0; |
359 | bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0; |
360 | |
361 | switch (dstFormat) { |
362 | case BL_FORMAT_XRGB32: |
363 | case BL_FORMAT_PRGB32: |
364 | d.simdData[0] = blPixelConverterMakePshufbPredicate32(d); |
365 | d.simdData[1] = d.simdData[0] + 0x04040404u; |
366 | d.simdData[2] = d.simdData[0] + 0x08080808u; |
367 | d.simdData[3] = d.simdData[0] + 0x0C0C0C0Cu; |
368 | |
369 | self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_ssse3 |
370 | : bl_convert_prgb32_from_xrgb32_ssse3; |
371 | return true; |
372 | |
373 | default: |
374 | return false; |
375 | } |
376 | } |
377 | |
378 | return false; |
379 | } |
380 | |
381 | #endif |
382 | |