1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_BUILD_OPT_AVX2
9
10#include "./blpixelconverter_p.h"
11#include "./blsimd_p.h"
12#include "./blsupport_p.h"
13
14// ============================================================================
15// [BLPixelConverter - PRGB32 <- XRGB32 (AVX2)]
16// ============================================================================
17
18static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_avx2(
19 const BLPixelConverterCore* self,
20 uint8_t* dstData, intptr_t dstStride,
21 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
22
23 using namespace SIMD;
24 if (!options)
25 options = &blPixelConverterDefaultOptions;
26
27 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
28 size_t gap = options->gap;
29
30 dstStride -= (w * 4) + gap;
31 srcStride -= (w * 4);
32
33 I256 fillMask = vseti256u32(d.fillMask);
34 I256 predicate = vdupli128(vloadi128u(d.simdData));
35
36 for (uint32_t y = h; y != 0; y--) {
37 uint32_t i = w;
38
39 while (i >= 32) {
40 I256 p0, p1, p2, p3;
41
42 p0 = vloadi256u(srcData + 0);
43 p1 = vloadi256u(srcData + 32);
44 p2 = vloadi256u(srcData + 64);
45 p3 = vloadi256u(srcData + 96);
46
47 p0 = vpshufb(p0, predicate);
48 p1 = vpshufb(p1, predicate);
49 p2 = vpshufb(p2, predicate);
50 p3 = vpshufb(p3, predicate);
51
52 p0 = vor(p0, fillMask);
53 p1 = vor(p1, fillMask);
54 p2 = vor(p2, fillMask);
55 p3 = vor(p3, fillMask);
56
57 vstorei256u(dstData + 0, p0);
58 vstorei256u(dstData + 32, p1);
59 vstorei256u(dstData + 64, p2);
60 vstorei256u(dstData + 96, p3);
61
62 dstData += 128;
63 srcData += 128;
64 i -= 32;
65 }
66
67 while (i >= 8) {
68 I256 p0;
69
70 p0 = vloadi256u(srcData);
71 p0 = vpshufb(p0, predicate);
72 p0 = vor(p0, fillMask);
73 vstorei256u(dstData, p0);
74
75 dstData += 32;
76 srcData += 32;
77 i -= 8;
78 }
79
80 while (i) {
81 I128 p0;
82
83 p0 = vloadi128_32(srcData);
84 p0 = vpshufb(p0, vcast<I128>(predicate));
85 p0 = vor(p0, vcast<I128>(fillMask));
86 vstorei32(dstData, p0);
87
88 dstData += 4;
89 srcData += 4;
90 i--;
91 }
92
93 dstData = blPixelConverterFillGap(dstData, gap);
94 dstData += dstStride;
95 srcData += srcStride;
96 }
97
98 return BL_SUCCESS;
99}
100
101// ============================================================================
102// [BLPixelConverter - PRGB32 <- ARGB32 (AVX2)]
103// ============================================================================
104
105static BLResult BL_CDECL bl_convert_prgb32_from_argb32_avx2(
106 const BLPixelConverterCore* self,
107 uint8_t* dstData, intptr_t dstStride,
108 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
109
110 using namespace SIMD;
111 if (!options)
112 options = &blPixelConverterDefaultOptions;
113
114 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
115 size_t gap = options->gap;
116
117 dstStride -= (w * 4) + gap;
118 srcStride -= (w * 4);
119
120 I256 a255 = vseti256u64(0x00FF000000000000u);
121 I256 fillMask = vseti256u32(d.fillMask);
122 I256 predicate = vdupli128(vloadi128u(d.simdData));
123
124 for (uint32_t y = h; y != 0; y--) {
125 uint32_t i = w;
126
127 while (i >= 16) {
128 I256 p0, p1, p2, p3;
129
130 p0 = vloadi256u(srcData + 0);
131 p2 = vloadi256u(srcData + 32);
132
133 I256 zero = vzeroi256();
134 p0 = vpshufb(p0, predicate);
135 p2 = vpshufb(p2, predicate);
136
137 p1 = vunpackhi8(p0, zero);
138 p0 = vunpackli8(p0, zero);
139 p3 = vunpackhi8(p2, zero);
140 p2 = vunpackli8(p2, zero);
141
142 p0 = vmuli16(vor(p0, a255), vswizi16<3, 3, 3, 3>(p0));
143 p1 = vmuli16(vor(p1, a255), vswizi16<3, 3, 3, 3>(p1));
144 p2 = vmuli16(vor(p2, a255), vswizi16<3, 3, 3, 3>(p2));
145 p3 = vmuli16(vor(p3, a255), vswizi16<3, 3, 3, 3>(p3));
146
147 p0 = vdiv255u16(p0);
148 p1 = vdiv255u16(p1);
149 p2 = vdiv255u16(p2);
150 p3 = vdiv255u16(p3);
151
152 p0 = vpacki16u8(p0, p1);
153 p2 = vpacki16u8(p2, p3);
154
155 p0 = vor(p0, fillMask);
156 p2 = vor(p2, fillMask);
157
158 vstorei256u(dstData + 0, p0);
159 vstorei256u(dstData + 32, p2);
160
161 dstData += 64;
162 srcData += 64;
163 i -= 16;
164 }
165
166 while (i >= 4) {
167 I128 p0, p1;
168 I128 a0, a1;
169
170 p0 = vloadi128u(srcData);
171 I128 zero = vzeroi128();
172 p0 = vpshufb(p0, vcast<I128>(predicate));
173
174 p1 = vunpackhi8(p0, zero);
175 p0 = vunpackli8(p0, zero);
176
177 a1 = vswizi16<3, 3, 3, 3>(p1);
178 p1 = vor(p1, vcast<I128>(a255));
179
180 a0 = vswizi16<3, 3, 3, 3>(p0);
181 p0 = vor(p0, vcast<I128>(a255));
182
183 p1 = vdiv255u16(vmuli16(p1, a1));
184 p0 = vdiv255u16(vmuli16(p0, a0));
185 p0 = vpacki16u8(p0, p1);
186 p0 = vor(p0, vcast<I128>(fillMask));
187 vstorei128u(dstData, p0);
188
189 dstData += 16;
190 srcData += 16;
191 i -= 4;
192 }
193
194 while (i) {
195 I128 p0;
196 I128 a0;
197
198 p0 = vloadi128_32(srcData);
199 I128 zero = vzeroi128();
200
201 p0 = vpshufb(p0, vcast<I128>(predicate));
202 p0 = vunpackli8(p0, zero);
203 a0 = vswizi16<3, 3, 3, 3>(p0);
204 p0 = vor(p0, vcast<I128>(a255));
205 p0 = vdiv255u16(vmuli16(p0, a0));
206 p0 = vpacki16u8(p0, p0);
207 p0 = vor(p0, vcast<I128>(fillMask));
208 vstorei32(dstData, p0);
209
210 dstData += 4;
211 srcData += 4;
212 i--;
213 }
214
215 dstData = blPixelConverterFillGap(dstData, gap);
216 dstData += dstStride;
217 srcData += srcStride;
218 }
219
220 return BL_SUCCESS;
221}
222
223// ============================================================================
224// [BLPixelConverter - Init (AVX2)]
225// ============================================================================
226
227static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate32(const BLPixelConverterData::NativeFromExternal& d) noexcept {
228 uint32_t rIndex = uint32_t(d.shifts[0]) >> 3;
229 uint32_t gIndex = uint32_t(d.shifts[1]) >> 3;
230 uint32_t bIndex = uint32_t(d.shifts[2]) >> 3;
231 uint32_t aIndex = uint32_t(d.shifts[3]) >> 3;
232
233 return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24);
234}
235
236bool blPixelConverterInitNativeFromXRGB_AVX2(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept {
237 BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
238 const BLFormatInfo& dstInfo = blPixelConverterFormatInfo[dstFormat];
239
240 if (srcInfo.depth == 32) {
241 // Only BYTE aligned components (8888 or X888 formats).
242 if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED))
243 return false;
244
245 bool isARGB = (srcInfo.flags & BL_FORMAT_FLAG_ALPHA) != 0;
246 bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0;
247
248 switch (dstFormat) {
249 case BL_FORMAT_XRGB32:
250 case BL_FORMAT_PRGB32:
251 d.simdData[0] = blPixelConverterMakePshufbPredicate32(d);
252 d.simdData[1] = d.simdData[0] + 0x04040404u;
253 d.simdData[2] = d.simdData[0] + 0x08080808u;
254 d.simdData[3] = d.simdData[0] + 0x0C0C0C0Cu;
255
256 self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_avx2
257 : bl_convert_prgb32_from_xrgb32_avx2;
258 return true;
259
260 default:
261 return false;
262 }
263 }
264
265 return false;
266}
267
268#endif
269