1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_BUILD_OPT_SSSE3
9
10#include "./blpixelconverter_p.h"
11#include "./blsimd_p.h"
12#include "./blsupport_p.h"
13
14// ============================================================================
15// [BLPixelConverter - PRGB32 <- RGB24 (SSSE3)]
16// ============================================================================
17
18static BLResult BL_CDECL bl_convert_prgb32_from_rgb24_ssse3(
19 const BLPixelConverterCore* self,
20 uint8_t* dstData, intptr_t dstStride,
21 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
22
23 using namespace SIMD;
24 if (!options)
25 options = &blPixelConverterDefaultOptions;
26
27 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
28 size_t gap = options->gap;
29
30 dstStride -= (w * 4) + gap;
31 srcStride -= (w * 3);
32
33 I128 fillMask = vseti128u32(d.fillMask);
34 I128 predicate = vloadi128u(d.simdData);
35
36 for (uint32_t y = h; y != 0; y--) {
37 uint32_t i = w;
38
39 while (i >= 16) {
40 I128 p0, p1, p2, p3;
41
42 p0 = vloadi128u(srcData + 0); // [x5|z4 y4 x4|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0]
43 p1 = vloadi128u(srcData + 16); // [yA|xA|z9 y9|x9 z8 y8 x8|z7 y7 x7 z6|y6 x6 z5 y5]
44 p3 = vloadi128u(srcData + 32); // [zF yF xF zE|yE xE zD yD|xD zC yC xC|zB yB xB zA]
45
46 p2 = vpalignr<8>(p3, p1); // [-- -- -- --|zB yB xB zA|yA|xA|z9 y9|x9 z8 y8 x8]
47 p1 = vpalignr<12>(p1, p0); // [-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5|x5|z4 y4 x4]
48 p3 = vsrli128b<4>(p3); // [-- -- -- --|zF yF xF zE|yE xE zD yD|xD zC yC xC]
49
50 p0 = vpshufb(p0, predicate);
51 p1 = vpshufb(p1, predicate);
52 p2 = vpshufb(p2, predicate);
53 p3 = vpshufb(p3, predicate);
54
55 p0 = vor(p0, fillMask);
56 p1 = vor(p1, fillMask);
57 p2 = vor(p2, fillMask);
58 p3 = vor(p3, fillMask);
59
60 vstorei128u(dstData + 0, p0);
61 vstorei128u(dstData + 16, p1);
62 vstorei128u(dstData + 32, p2);
63 vstorei128u(dstData + 48, p3);
64
65 dstData += 64;
66 srcData += 48;
67 i -= 16;
68 }
69
70 if (i >= 8) {
71 I128 p0, p1;
72
73 p0 = vloadi128u (srcData + 0); // [x5|z4 y4 x4|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0]
74 p1 = vloadi128_64(srcData + 16); // [-- -- -- --|-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5]
75
76 p1 = vpalignr<12>(p1, p0); // [-- -- -- --|z7 y7 x7 z6|y6 x6 z5 y5|x5|z4 y4 x4]
77
78 p0 = vpshufb(p0, predicate);
79 p1 = vpshufb(p1, predicate);
80
81 p0 = vor(p0, fillMask);
82 p1 = vor(p1, fillMask);
83
84 vstorei128u(dstData + 0, p0);
85 vstorei128u(dstData + 16, p1);
86
87 dstData += 32;
88 srcData += 24;
89 i -= 8;
90 }
91
92 if (i >= 4) {
93 I128 p0, p1;
94
95 p0 = vloadi128_64(srcData + 0); // [-- -- -- --|-- -- -- --|y2 x2 z1 y1|x1 z0 y0 x0]
96 p1 = vloadi128_32(srcData + 8); // [-- -- -- --|-- -- -- --|-- -- -- --|z3 y3 x3 z2]
97
98 p0 = vunpackli64(p0, p1); // [-- -- -- --|z3 y3 x3 z2|y2 x2 z1 y1|x1 z0 y0 x0]
99 p0 = vpshufb(p0, predicate);
100 p0 = vor(p0, fillMask);
101
102 vstorei128u(dstData + 0, p0);
103
104 dstData += 16;
105 srcData += 12;
106 i -= 4;
107 }
108
109 while (i) {
110 uint32_t yx = blMemReadU16u(srcData + 0);
111 uint32_t z = blMemReadU8(srcData + 2);
112
113 I128 p0 = vcvtu32i128((z << 16) | yx);
114 p0 = vpshufb(p0, predicate);
115 p0 = vor(p0, fillMask);
116 vstorei32(dstData, p0);
117
118 dstData += 4;
119 srcData += 3;
120 i--;
121 }
122
123 dstData = blPixelConverterFillGap(dstData, gap);
124 dstData += dstStride;
125 srcData += srcStride;
126 }
127
128 return BL_SUCCESS;
129}
130
131// ============================================================================
132// [BLPixelConverter - PRGB32 <- XRGB32 (SSSE3)]
133// ============================================================================
134
135static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_ssse3(
136 const BLPixelConverterCore* self,
137 uint8_t* dstData, intptr_t dstStride,
138 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
139
140 using namespace SIMD;
141 if (!options)
142 options = &blPixelConverterDefaultOptions;
143
144 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
145 size_t gap = options->gap;
146
147 dstStride -= (w * 4) + gap;
148 srcStride -= (w * 4);
149
150 using namespace SIMD;
151 I128 fillMask = vseti128u32(d.fillMask);
152 I128 predicate = vloadi128u(d.simdData);
153
154 for (uint32_t y = h; y != 0; y--) {
155 uint32_t i = w;
156
157 while (i >= 16) {
158 I128 p0, p1, p2, p3;
159
160 p0 = vloadi128u(srcData + 0);
161 p1 = vloadi128u(srcData + 16);
162 p2 = vloadi128u(srcData + 32);
163 p3 = vloadi128u(srcData + 48);
164
165 p0 = vpshufb(p0, predicate);
166 p1 = vpshufb(p1, predicate);
167 p2 = vpshufb(p2, predicate);
168 p3 = vpshufb(p3, predicate);
169
170 p0 = vor(p0, fillMask);
171 p1 = vor(p1, fillMask);
172 p2 = vor(p2, fillMask);
173 p3 = vor(p3, fillMask);
174
175 vstorei128u(dstData + 0, p0);
176 vstorei128u(dstData + 16, p1);
177 vstorei128u(dstData + 32, p2);
178 vstorei128u(dstData + 48, p3);
179
180 dstData += 64;
181 srcData += 64;
182 i -= 16;
183 }
184
185 while (i >= 4) {
186 I128 p0;
187
188 p0 = vloadi128u(srcData);
189 p0 = vpshufb(p0, predicate);
190 p0 = vor(p0, fillMask);
191 vstorei128u(dstData, p0);
192
193 dstData += 16;
194 srcData += 16;
195 i -= 4;
196 }
197
198 while (i) {
199 I128 p0;
200
201 p0 = vloadi128_32(srcData);
202 p0 = vpshufb(p0, predicate);
203 p0 = vor(p0, fillMask);
204 vstorei32(dstData, p0);
205
206 dstData += 4;
207 srcData += 4;
208 i--;
209 }
210
211 dstData = blPixelConverterFillGap(dstData, gap);
212 dstData += dstStride;
213 srcData += srcStride;
214 }
215
216 return BL_SUCCESS;
217}
218
219// ============================================================================
220// [BLPixelConverter - PRGB32 <- ARGB32 (SSSE3)]
221// ============================================================================
222
223static BLResult BL_CDECL bl_convert_prgb32_from_argb32_ssse3(
224 const BLPixelConverterCore* self,
225 uint8_t* dstData, intptr_t dstStride,
226 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
227
228 using namespace SIMD;
229 if (!options)
230 options = &blPixelConverterDefaultOptions;
231
232 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
233 size_t gap = options->gap;
234
235 dstStride -= (w * 4) + gap;
236 srcStride -= (w * 4);
237
238 I128 zero = vzeroi128();
239 I128 a255 = vseti128u64(0x00FF000000000000u);
240 I128 fillMask = vseti128u32(d.fillMask);
241 I128 predicate = vloadi128u(d.simdData);
242
243 for (uint32_t y = h; y != 0; y--) {
244 uint32_t i = w;
245
246 while (i >= 4) {
247 I128 p0, p1;
248 I128 a0, a1;
249
250 p0 = vloadi128u(srcData);
251 p0 = vpshufb(p0, predicate);
252
253 p1 = vunpackhi8(p0, zero);
254 p0 = vunpackli8(p0, zero);
255
256 a1 = vswizi16<3, 3, 3, 3>(p1);
257 p1 = vor(p1, a255);
258
259 a0 = vswizi16<3, 3, 3, 3>(p0);
260 p0 = vor(p0, a255);
261
262 p1 = vdiv255u16(vmuli16(p1, a1));
263 p0 = vdiv255u16(vmuli16(p0, a0));
264 p0 = vpacki16u8(p0, p1);
265 p0 = vor(p0, fillMask);
266 vstorei128u(dstData, p0);
267
268 dstData += 16;
269 srcData += 16;
270 i -= 4;
271 }
272
273 while (i) {
274 I128 p0;
275 I128 a0;
276
277 p0 = vloadi128_32(srcData);
278 p0 = vpshufb(p0, predicate);
279 p0 = vunpackli8(p0, zero);
280 a0 = vswizi16<3, 3, 3, 3>(p0);
281 p0 = vor(p0, a255);
282 p0 = vdiv255u16(vmuli16(p0, a0));
283 p0 = vpacki16u8(p0, p0);
284 p0 = vor(p0, fillMask);
285 vstorei32(dstData, p0);
286
287 dstData += 4;
288 srcData += 4;
289 i--;
290 }
291
292 dstData = blPixelConverterFillGap(dstData, gap);
293 dstData += dstStride;
294 srcData += srcStride;
295 }
296
297 return BL_SUCCESS;
298}
299
300// ============================================================================
301// [BLPixelConverter - Init (SSSE3)]
302// ============================================================================
303
304static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate24(const BLPixelConverterData::NativeFromExternal& d) noexcept {
305 uint32_t aIndex = 0x80u;
306 uint32_t rIndex = uint32_t(d.shifts[0]) >> 3;
307 uint32_t gIndex = uint32_t(d.shifts[1]) >> 3;
308 uint32_t bIndex = uint32_t(d.shifts[2]) >> 3;
309
310 return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24);
311}
312
313static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate32(const BLPixelConverterData::NativeFromExternal& d) noexcept {
314 uint32_t rIndex = uint32_t(d.shifts[0]) >> 3;
315 uint32_t gIndex = uint32_t(d.shifts[1]) >> 3;
316 uint32_t bIndex = uint32_t(d.shifts[2]) >> 3;
317 uint32_t aIndex = uint32_t(d.shifts[3]) >> 3;
318
319 return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24);
320}
321
322bool blPixelConverterInitNativeFromXRGB_SSSE3(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept {
323 BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
324 if (srcInfo.depth == 24) {
325 // Only BYTE aligned components (888 format).
326 if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED))
327 return false;
328
329 // We expect RGB components in any order, but not alpha.
330 if (d.masks[0] != 0)
331 return false;
332
333 switch (dstFormat) {
334 case BL_FORMAT_XRGB32:
335 case BL_FORMAT_PRGB32:
336 d.simdData[0] = blPixelConverterMakePshufbPredicate24(d);
337 d.simdData[1] = d.simdData[0] + 0x00030303u;
338 d.simdData[2] = d.simdData[0] + 0x00060606u;
339 d.simdData[3] = d.simdData[0] + 0x00090909u;
340
341 self->convertFunc = bl_convert_prgb32_from_rgb24_ssse3;
342 return true;
343
344 default:
345 return false;
346 }
347 }
348 else if (srcInfo.depth == 32) {
349 // Only BYTE aligned components (8888 or X888 formats).
350 if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED))
351 return false;
352
353 // This combination is provided by SSE2 converter and doesn't use PSHUFB.
354 // It's better on machines where PSHUFB is slow like ATOMs.
355 if (d.shifts[0] == 16 && d.shifts[1] == 8 && d.shifts[2] == 0)
356 return false;
357
358 bool isARGB = (srcInfo.flags & BL_FORMAT_FLAG_ALPHA) != 0;
359 bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0;
360
361 switch (dstFormat) {
362 case BL_FORMAT_XRGB32:
363 case BL_FORMAT_PRGB32:
364 d.simdData[0] = blPixelConverterMakePshufbPredicate32(d);
365 d.simdData[1] = d.simdData[0] + 0x04040404u;
366 d.simdData[2] = d.simdData[0] + 0x08080808u;
367 d.simdData[3] = d.simdData[0] + 0x0C0C0C0Cu;
368
369 self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_ssse3
370 : bl_convert_prgb32_from_xrgb32_ssse3;
371 return true;
372
373 default:
374 return false;
375 }
376 }
377
378 return false;
379}
380
381#endif
382