blpixelconverter_avx2.cpp source code [Blend2d/src/blend2d/blpixelconverter_avx2.cpp]

1	// [Blend2D]
2	// 2D Vector Graphics Powered by a JIT Compiler.
3	//
4	// [License]
5	// Zlib - See LICENSE.md file in the package.
6
7	#include "./blapi-build_p.h"
8	#ifdef BL_BUILD_OPT_AVX2
9
10	#include "./blpixelconverter_p.h"
11	#include "./blsimd_p.h"
12	#include "./blsupport_p.h"
13
14	// ============================================================================
15	// [BLPixelConverter - PRGB32 <- XRGB32 (AVX2)]
16	// ============================================================================
17
18	static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_avx2(
19	const BLPixelConverterCore* self,
20	uint8_t* dstData, intptr_t dstStride,
21	const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
22
23	using namespace SIMD;
24	if (!options)
25	options = &blPixelConverterDefaultOptions;
26
27	const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
28	size_t gap = options->gap;
29
30	dstStride -= (w * `4`) + gap;
31	srcStride -= (w * `4`);
32
33	I256 fillMask = vseti256u32(d.fillMask);
34	I256 predicate = vdupli128(vloadi128u(d.simdData));
35
36	for (uint32_t y = h; y != `0`; y--) {
37	uint32_t i = w;
38
39	while (i >= `32`) {
40	I256 p0, p1, p2, p3;
41
42	p0 = vloadi256u(srcData + `0`);
43	p1 = vloadi256u(srcData + `32`);
44	p2 = vloadi256u(srcData + `64`);
45	p3 = vloadi256u(srcData + `96`);
46
47	p0 = vpshufb(p0, predicate);
48	p1 = vpshufb(p1, predicate);
49	p2 = vpshufb(p2, predicate);
50	p3 = vpshufb(p3, predicate);
51
52	p0 = vor(p0, fillMask);
53	p1 = vor(p1, fillMask);
54	p2 = vor(p2, fillMask);
55	p3 = vor(p3, fillMask);
56
57	vstorei256u(dstData + `0`, p0);
58	vstorei256u(dstData + `32`, p1);
59	vstorei256u(dstData + `64`, p2);
60	vstorei256u(dstData + `96`, p3);
61
62	dstData += `128`;
63	srcData += `128`;
64	i -= `32`;
65	}
66
67	while (i >= `8`) {
68	I256 p0;
69
70	p0 = vloadi256u(srcData);
71	p0 = vpshufb(p0, predicate);
72	p0 = vor(p0, fillMask);
73	vstorei256u(dstData, p0);
74
75	dstData += `32`;
76	srcData += `32`;
77	i -= `8`;
78	}
79
80	while (i) {
81	I128 p0;
82
83	p0 = vloadi128_32(srcData);
84	p0 = vpshufb(p0, vcast<I128>(predicate));
85	p0 = vor(p0, vcast<I128>(fillMask));
86	vstorei32(dstData, p0);
87
88	dstData += `4`;
89	srcData += `4`;
90	i--;
91	}
92
93	dstData = blPixelConverterFillGap(dstData, gap);
94	dstData += dstStride;
95	srcData += srcStride;
96	}
97
98	return BL_SUCCESS;
99	}
100
101	// ============================================================================
102	// [BLPixelConverter - PRGB32 <- ARGB32 (AVX2)]
103	// ============================================================================
104
105	static BLResult BL_CDECL bl_convert_prgb32_from_argb32_avx2(
106	const BLPixelConverterCore* self,
107	uint8_t* dstData, intptr_t dstStride,
108	const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
109
110	using namespace SIMD;
111	if (!options)
112	options = &blPixelConverterDefaultOptions;
113
114	const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
115	size_t gap = options->gap;
116
117	dstStride -= (w * `4`) + gap;
118	srcStride -= (w * `4`);
119
120	I256 a255 = vseti256u64(`0x00FF000000000000u`);
121	I256 fillMask = vseti256u32(d.fillMask);
122	I256 predicate = vdupli128(vloadi128u(d.simdData));
123
124	for (uint32_t y = h; y != `0`; y--) {
125	uint32_t i = w;
126
127	while (i >= `16`) {
128	I256 p0, p1, p2, p3;
129
130	p0 = vloadi256u(srcData + `0`);
131	p2 = vloadi256u(srcData + `32`);
132
133	I256 zero = vzeroi256();
134	p0 = vpshufb(p0, predicate);
135	p2 = vpshufb(p2, predicate);
136
137	p1 = vunpackhi8(p0, zero);
138	p0 = vunpackli8(p0, zero);
139	p3 = vunpackhi8(p2, zero);
140	p2 = vunpackli8(p2, zero);
141
142	p0 = vmuli16(vor(p0, a255), vswizi16<`3`, `3`, `3`, `3`>(p0));
143	p1 = vmuli16(vor(p1, a255), vswizi16<`3`, `3`, `3`, `3`>(p1));
144	p2 = vmuli16(vor(p2, a255), vswizi16<`3`, `3`, `3`, `3`>(p2));
145	p3 = vmuli16(vor(p3, a255), vswizi16<`3`, `3`, `3`, `3`>(p3));
146
147	p0 = vdiv255u16(p0);
148	p1 = vdiv255u16(p1);
149	p2 = vdiv255u16(p2);
150	p3 = vdiv255u16(p3);
151
152	p0 = vpacki16u8(p0, p1);
153	p2 = vpacki16u8(p2, p3);
154
155	p0 = vor(p0, fillMask);
156	p2 = vor(p2, fillMask);
157
158	vstorei256u(dstData + `0`, p0);
159	vstorei256u(dstData + `32`, p2);
160
161	dstData += `64`;
162	srcData += `64`;
163	i -= `16`;
164	}
165
166	while (i >= `4`) {
167	I128 p0, p1;
168	I128 a0, a1;
169
170	p0 = vloadi128u(srcData);
171	I128 zero = vzeroi128();
172	p0 = vpshufb(p0, vcast<I128>(predicate));
173
174	p1 = vunpackhi8(p0, zero);
175	p0 = vunpackli8(p0, zero);
176
177	a1 = vswizi16<`3`, `3`, `3`, `3`>(p1);
178	p1 = vor(p1, vcast<I128>(a255));
179
180	a0 = vswizi16<`3`, `3`, `3`, `3`>(p0);
181	p0 = vor(p0, vcast<I128>(a255));
182
183	p1 = vdiv255u16(vmuli16(p1, a1));
184	p0 = vdiv255u16(vmuli16(p0, a0));
185	p0 = vpacki16u8(p0, p1);
186	p0 = vor(p0, vcast<I128>(fillMask));
187	vstorei128u(dstData, p0);
188
189	dstData += `16`;
190	srcData += `16`;
191	i -= `4`;
192	}
193
194	while (i) {
195	I128 p0;
196	I128 a0;
197
198	p0 = vloadi128_32(srcData);
199	I128 zero = vzeroi128();
200
201	p0 = vpshufb(p0, vcast<I128>(predicate));
202	p0 = vunpackli8(p0, zero);
203	a0 = vswizi16<`3`, `3`, `3`, `3`>(p0);
204	p0 = vor(p0, vcast<I128>(a255));
205	p0 = vdiv255u16(vmuli16(p0, a0));
206	p0 = vpacki16u8(p0, p0);
207	p0 = vor(p0, vcast<I128>(fillMask));
208	vstorei32(dstData, p0);
209
210	dstData += `4`;
211	srcData += `4`;
212	i--;
213	}
214
215	dstData = blPixelConverterFillGap(dstData, gap);
216	dstData += dstStride;
217	srcData += srcStride;
218	}
219
220	return BL_SUCCESS;
221	}
222
223	// ============================================================================
224	// [BLPixelConverter - Init (AVX2)]
225	// ============================================================================
226
227	static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate32(const BLPixelConverterData::NativeFromExternal& d) noexcept {
228	uint32_t rIndex = uint32_t(d.shifts[`0`]) >> `3`;
229	uint32_t gIndex = uint32_t(d.shifts[`1`]) >> `3`;
230	uint32_t bIndex = uint32_t(d.shifts[`2`]) >> `3`;
231	uint32_t aIndex = uint32_t(d.shifts[`3`]) >> `3`;
232
233	return (rIndex << `16`) \| (gIndex << `8`) \| (bIndex << `0`) \| (aIndex << `24`);
234	}
235
236	bool blPixelConverterInitNativeFromXRGB_AVX2(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept {
237	BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
238	const BLFormatInfo& dstInfo = blPixelConverterFormatInfo[dstFormat];
239
240	if (srcInfo.depth == `32`) {
241	// Only BYTE aligned components (8888 or X888 formats).
242	if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED))
243	return false;
244
245	bool isARGB = (srcInfo.flags & BL_FORMAT_FLAG_ALPHA) != `0`;
246	bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != `0`;
247
248	switch (dstFormat) {
249	case BL_FORMAT_XRGB32:
250	case BL_FORMAT_PRGB32:
251	d.simdData[`0`] = blPixelConverterMakePshufbPredicate32(d);
252	d.simdData[`1`] = d.simdData[`0`] + `0x04040404u`;
253	d.simdData[`2`] = d.simdData[`0`] + `0x08080808u`;
254	d.simdData[`3`] = d.simdData[`0`] + `0x0C0C0C0Cu`;
255
256	self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_avx2
257	: bl_convert_prgb32_from_xrgb32_avx2;
258	return true;
259
260	default:
261	return false;
262	}
263	}
264
265	return false;
266	}
267
268	#endif
269

Browse the source code of Blend2d/src/blend2d/blpixelconverter_avx2.cpp