qdrawhelper_avx2.cpp source code [Qt/src/gui/painting/qdrawhelper_avx2.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2018 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtGui module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include "qdrawhelper_p.h"
42	#include "qdrawhelper_x86_p.h"
43	#include "qdrawingprimitive_sse2_p.h"
44	#include "qpixellayout_p.h"
45	#include "qrgba64_p.h"
46
47	#if defined(QT_COMPILER_SUPPORTS_AVX2)
48
49	QT_BEGIN_NAMESPACE
50
// Fixed-point constants. Coordinates in this file carry 16 fractional bits
// (they are shifted right by 16 to obtain a pixel index).
enum {
    FixedScale = 1 << 16,   // one whole unit in that fixed-point format
    HalfPoint = 1 << 15     // half of FixedScale
};
55
56	// Vectorized blend functions:
57
58	// See BYTE_MUL_SSE2 for details.
59	inline static void Q_DECL_VECTORCALL
60	BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
61	{
62	__m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, `8`);
63	__m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
64
65	pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel);
66	pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel);
67
68	pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, `8`));
69	pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, `8`));
70	pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half);
71	pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half);
72
73	pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, `8`);
74	pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
75
76	pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
77	}
78
79	inline static void Q_DECL_VECTORCALL
80	BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
81	{
82	__m256i pixelVectorAG = _mm256_srli_epi32(pixelVector, `16`);
83	__m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
84
85	pixelVectorAG = _mm256_mullo_epi32(pixelVectorAG, alphaChannel);
86	pixelVectorRB = _mm256_mullo_epi32(pixelVectorRB, alphaChannel);
87
88	pixelVectorRB = _mm256_add_epi32(pixelVectorRB, _mm256_srli_epi32(pixelVectorRB, `16`));
89	pixelVectorAG = _mm256_add_epi32(pixelVectorAG, _mm256_srli_epi32(pixelVectorAG, `16`));
90	pixelVectorRB = _mm256_add_epi32(pixelVectorRB, half);
91	pixelVectorAG = _mm256_add_epi32(pixelVectorAG, half);
92
93	pixelVectorRB = _mm256_srli_epi32(pixelVectorRB, `16`);
94	pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
95
96	pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
97	}
98
99	// See INTERPOLATE_PIXEL_255_SSE2 for details.
100	inline static void Q_DECL_VECTORCALL
101	INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
102	{
103	const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, `8`);
104	const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, `8`);
105	const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
106	const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
107	const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel);
108	const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel);
109	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel);
110	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel);
111	__m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
112	__m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
113	finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, `8`));
114	finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, `8`));
115	finalAG = _mm256_add_epi16(finalAG, half);
116	finalRB = _mm256_add_epi16(finalRB, half);
117	finalAG = _mm256_andnot_si256(colorMask, finalAG);
118	finalRB = _mm256_srli_epi16(finalRB, `8`);
119
120	dstVector = _mm256_or_si256(finalAG, finalRB);
121	}
122
123	inline static void Q_DECL_VECTORCALL
124	INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
125	{
126	const __m256i srcVectorAG = _mm256_srli_epi32(srcVector, `16`);
127	const __m256i dstVectorAG = _mm256_srli_epi32(dstVector, `16`);
128	const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
129	const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
130	const __m256i srcVectorAGalpha = _mm256_mullo_epi32(srcVectorAG, alphaChannel);
131	const __m256i srcVectorRBalpha = _mm256_mullo_epi32(srcVectorRB, alphaChannel);
132	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(dstVectorAG, oneMinusAlphaChannel);
133	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(dstVectorRB, oneMinusAlphaChannel);
134	__m256i finalAG = _mm256_add_epi32(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
135	__m256i finalRB = _mm256_add_epi32(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
136	finalAG = _mm256_add_epi32(finalAG, _mm256_srli_epi32(finalAG, `16`));
137	finalRB = _mm256_add_epi32(finalRB, _mm256_srli_epi32(finalRB, `16`));
138	finalAG = _mm256_add_epi32(finalAG, half);
139	finalRB = _mm256_add_epi32(finalRB, half);
140	finalAG = _mm256_andnot_si256(colorMask, finalAG);
141	finalRB = _mm256_srli_epi32(finalRB, `16`);
142
143	dstVector = _mm256_or_si256(finalAG, finalRB);
144	}
145
146
147	// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
148	inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 dst, const* quint32 src, const* int length)
149	{
150	const __m256i half = _mm256_set1_epi16(`0x80`);
151	const __m256i one = _mm256_set1_epi16(`0xff`);
152	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
153	const __m256i alphaMask = _mm256_set1_epi32(`0xff000000`);
154	const __m256i offsetMask = _mm256_setr_epi32(`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`);
155	const __m256i alphaShuffleMask = _mm256_set_epi8(char(`0xff`),`15`,char(`0xff`),`15`,char(`0xff`),`11`,char(`0xff`),`11`,char(`0xff`),`7`,char(`0xff`),`7`,char(`0xff`),`3`,char(`0xff`),`3`,
156	char(`0xff`),`15`,char(`0xff`),`15`,char(`0xff`),`11`,char(`0xff`),`11`,char(`0xff`),`7`,char(`0xff`),`7`,char(`0xff`),`3`,char(`0xff`),`3`);
157
158	const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> `2`) & `0x7`;
159
160	int x = `0`;
161	// Prologue to handle all pixels until dst is 32-byte aligned in one step.
162	if (minusOffsetToAlignDstOn32Bytes != `0` && x < (length - `7`)) {
163	const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - `1`), offsetMask);
164	const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
165	const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask);
166	if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) {
167	if (_mm256_testc_si256(srcVector, prologueAlphaMask)) {
168	_mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector);
169	} else {
170	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
171	alphaChannel = _mm256_sub_epi16(one, alphaChannel);
172	__m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
173	BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
174	dstVector = _mm256_add_epi8(dstVector, srcVector);
175	_mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector);
176	}
177	}
178	x += (`8` - minusOffsetToAlignDstOn32Bytes);
179	}
180
181	for (; x < (length - `7`); x += `8`) {
182	const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
183	if (!_mm256_testz_si256(srcVector, alphaMask)) {
184	if (_mm256_testc_si256(srcVector, alphaMask)) {
185	_mm256_store_si256((__m256i *)&dst[x], srcVector);
186	} else {
187	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
188	alphaChannel = _mm256_sub_epi16(one, alphaChannel);
189	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
190	BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
191	dstVector = _mm256_add_epi8(dstVector, srcVector);
192	_mm256_store_si256((__m256i *)&dst[x], dstVector);
193	}
194	}
195	}
196
197	// Epilogue to handle all remaining pixels in one step.
198	if (x < length) {
199	const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length));
200	const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask);
201	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
202	if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
203	if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) {
204	_mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector);
205	} else {
206	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
207	alphaChannel = _mm256_sub_epi16(one, alphaChannel);
208	__m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask);
209	BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
210	dstVector = _mm256_add_epi8(dstVector, srcVector);
211	_mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector);
212	}
213	}
214	}
215	}
216
217
218	// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
219	inline static void Q_DECL_VECTORCALL
220	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 dst, const* quint32 src, const* int length, const int const_alpha)
221	{
222	int x = `0`;
223
224	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
225	blend_pixel(dst[x], src[x], const_alpha);
226
227	const __m256i half = _mm256_set1_epi16(`0x80`);
228	const __m256i one = _mm256_set1_epi16(`0xff`);
229	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
230	const __m256i alphaMask = _mm256_set1_epi32(`0xff000000`);
231	const __m256i alphaShuffleMask = _mm256_set_epi8(char(`0xff`),`15`,char(`0xff`),`15`,char(`0xff`),`11`,char(`0xff`),`11`,char(`0xff`),`7`,char(`0xff`),`7`,char(`0xff`),`3`,char(`0xff`),`3`,
232	char(`0xff`),`15`,char(`0xff`),`15`,char(`0xff`),`11`,char(`0xff`),`11`,char(`0xff`),`7`,char(`0xff`),`7`,char(`0xff`),`3`,char(`0xff`),`3`);
233	const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
234	for (; x < (length - `7`); x += `8`) {
235	__m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
236	if (!_mm256_testz_si256(srcVector, alphaMask)) {
237	BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half);
238
239	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
240	alphaChannel = _mm256_sub_epi16(one, alphaChannel);
241	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
242	BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
243	dstVector = _mm256_add_epi8(dstVector, srcVector);
244	_mm256_store_si256((__m256i *)&dst[x], dstVector);
245	}
246	}
247	SIMD_EPILOGUE(x, length, `7`)
248	blend_pixel(dst[x], src[x], const_alpha);
249	}
250
251	void qt_blend_argb32_on_argb32_avx2(uchar destPixels, int* dbpl,
252	const uchar srcPixels, int* sbpl,
253	int w, int h,
254	int const_alpha)
255	{
256	if (const_alpha == `256`) {
257	for (int y = `0`; y < h; ++y) {
258	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
259	quint32 dst = reinterpret_cast<quint32 >(destPixels);
260	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w);
261	destPixels += dbpl;
262	srcPixels += sbpl;
263	}
264	} else if (const_alpha != `0`) {
265	const_alpha = (const_alpha * `255`) >> `8`;
266	for (int y = `0`; y < h; ++y) {
267	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
268	quint32 dst = reinterpret_cast<quint32 >(destPixels);
269	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha);
270	destPixels += dbpl;
271	srcPixels += sbpl;
272	}
273	}
274	}
275
276	void qt_blend_rgb32_on_rgb32_avx2(uchar destPixels, int* dbpl,
277	const uchar srcPixels, int* sbpl,
278	int w, int h,
279	int const_alpha)
280	{
281	if (const_alpha == `256`) {
282	for (int y = `0`; y < h; ++y) {
283	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
284	quint32 dst = reinterpret_cast<quint32 >(destPixels);
285	::memcpy(dst, src, w * sizeof(uint));
286	srcPixels += sbpl;
287	destPixels += dbpl;
288	}
289	return;
290	}
291	if (const_alpha == `0`)
292	return;
293
294	const __m256i half = _mm256_set1_epi16(`0x80`);
295	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
296
297	const_alpha = (const_alpha * `255`) >> `8`;
298	int one_minus_const_alpha = `255` - const_alpha;
299	const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
300	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha);
301	for (int y = `0`; y < h; ++y) {
302	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
303	quint32 dst = reinterpret_cast<quint32 >(destPixels);
304	int x = `0`;
305
306	// First, align dest to 32 bytes:
307	ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
308	dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
309
310	// 2) interpolate pixels with AVX2
311	for (; x < (w - `7`); x += `8`) {
312	const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
313	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
314	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
315	_mm256_store_si256((__m256i *)&dst[x], dstVector);
316	}
317
318	// 3) Epilogue
319	SIMD_EPILOGUE(x, w, `7`)
320	dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
321
322	srcPixels += sbpl;
323	destPixels += dbpl;
324	}
325	}
326
327	static Q_NEVER_INLINE
328	void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
329	{
330	__m128i value128 = _mm256_castsi256_si128(value256);
331
332	// main body
333	__m256i dst256 = reinterpret_cast<__m256i >(dest);
334	uchar *end = dest + bytes;
335	while (reinterpret_cast<uchar *>(dst256 + `4`) <= end) {
336	_mm256_storeu_si256(dst256 + `0`, value256);
337	_mm256_storeu_si256(dst256 + `1`, value256);
338	_mm256_storeu_si256(dst256 + `2`, value256);
339	_mm256_storeu_si256(dst256 + `3`, value256);
340	dst256 += `4`;
341	}
342
343	// first epilogue: fewer than 128 bytes / 32 entries
344	bytes = end - reinterpret_cast<uchar *>(dst256);
345	switch (bytes / sizeof(value256)) {
346	case `3`: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
347	case `2`: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
348	case `1`: _mm256_storeu_si256(dst256++, value256);
349	}
350
351	// second epilogue: fewer than 32 bytes
352	__m128i dst128 = reinterpret_cast<__m128i >(dst256);
353	if (bytes & sizeof(value128))
354	_mm_storeu_si128(dst128++, value128);
355
356	// third epilogue: fewer than 16 bytes
357	if (bytes & `8`)
358	_mm_storel_epi64(reinterpret_cast<__m128i *>(end - `8`), value128);
359	}
360
361	void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
362	{
363	#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && !defined(Q_CC_INTEL)
364	// work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820
365	__m128i value64 = _mm_set_epi64x(`0`, value); // _mm_cvtsi64_si128(value);
366	# ifdef Q_PROCESSOR_X86_64
367	asm ("" : "+x" (value64));
368	# endif
369	__m256i value256 = _mm256_broadcastq_epi64(value64);
370	#else
371	__m256i value256 = _mm256_set1_epi64x(value);
372	#endif
373
374	qt_memfillXX_avx2(reinterpret_cast<uchar >(dest), value256, count sizeof(quint64));
375	}
376
377	void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)
378	{
379	if (count % `2`) {
380	// odd number of pixels, round to even
381	*dest++ = value;
382	--count;
383	}
384	qt_memfillXX_avx2(reinterpret_cast<uchar >(dest), _mm256_set1_epi32(value), count sizeof(quint32));
385	}
386
387	void QT_FASTCALL comp_func_SourceOver_avx2(uint destPixels, const* uint srcPixels, int* length, uint const_alpha)
388	{
389	Q_ASSERT(const_alpha < `256`);
390
391	const quint32 src = (const* quint32 *) srcPixels;
392	quint32 dst = (quint32 ) destPixels;
393
394	if (const_alpha == `255`)
395	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
396	else
397	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
398	}
399
400	#if QT_CONFIG(raster_64bit)
401	void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
402	{
403	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
404	const __m256i half = _mm256_set1_epi32(`0x8000`);
405	const __m256i one = _mm256_set1_epi32(`0xffff`);
406	const __m256i colorMask = _mm256_set1_epi32(`0x0000ffff`);
407	__m256i alphaMask = _mm256_set1_epi32(`0xff000000`);
408	alphaMask = _mm256_unpacklo_epi8(alphaMask, alphaMask);
409	const __m256i alphaShuffleMask = _mm256_set_epi8(char(`0xff`),char(`0xff`),`15`,`14`,char(`0xff`),char(`0xff`),`15`,`14`,char(`0xff`),char(`0xff`),`7`,`6`,char(`0xff`),char(`0xff`),`7`,`6`,
410	char(`0xff`),char(`0xff`),`15`,`14`,char(`0xff`),char(`0xff`),`15`,`14`,char(`0xff`),char(`0xff`),`7`,`6`,char(`0xff`),char(`0xff`),`7`,`6`);
411
412	if (const_alpha == `255`) {
413	int x = `0`;
414	for (; x < length && (quintptr(dst + x) & `31`); ++x)
415	blend_pixel(dst[x], src[x]);
416	for (; x < length - `3`; x += `4`) {
417	const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
418	if (!_mm256_testz_si256(srcVector, alphaMask)) {
419	// Not all transparent
420	if (_mm256_testc_si256(srcVector, alphaMask)) {
421	// All opaque
422	_mm256_store_si256((__m256i *)&dst[x], srcVector);
423	} else {
424	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
425	alphaChannel = _mm256_sub_epi32(one, alphaChannel);
426	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
427	BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
428	dstVector = _mm256_add_epi16(dstVector, srcVector);
429	_mm256_store_si256((__m256i *)&dst[x], dstVector);
430	}
431	}
432	}
433	SIMD_EPILOGUE(x, length, `3`)
434	blend_pixel(dst[x], src[x]);
435	} else {
436	const __m256i constAlphaVector = _mm256_set1_epi32(const_alpha \| (const_alpha << `8`));
437	int x = `0`;
438	for (; x < length && (quintptr(dst + x) & `31`); ++x)
439	blend_pixel(dst[x], src[x], const_alpha);
440	for (; x < length - `3`; x += `4`) {
441	__m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
442	if (!_mm256_testz_si256(srcVector, alphaMask)) {
443	// Not all transparent
444	BYTE_MUL_RGB64_AVX2(srcVector, constAlphaVector, colorMask, half);
445
446	__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
447	alphaChannel = _mm256_sub_epi32(one, alphaChannel);
448	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
449	BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
450	dstVector = _mm256_add_epi16(dstVector, srcVector);
451	_mm256_store_si256((__m256i *)&dst[x], dstVector);
452	}
453	}
454	SIMD_EPILOGUE(x, length, `3`)
455	blend_pixel(dst[x], src[x], const_alpha);
456	}
457	}
458	#endif
459
460	void QT_FASTCALL comp_func_Source_avx2(uint dst, const* uint src, int* length, uint const_alpha)
461	{
462	if (const_alpha == `255`) {
463	::memcpy(dst, src, length * sizeof(uint));
464	} else {
465	const int ialpha = `255` - const_alpha;
466
467	int x = `0`;
468
469	// 1) prologue, align on 32 bytes
470	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
471	dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
472
473	// 2) interpolate pixels with AVX2
474	const __m256i half = _mm256_set1_epi16(`0x80`);
475	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
476	const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
477	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha);
478	for (; x < length - `7`; x += `8`) {
479	const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
480	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
481	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
482	_mm256_store_si256((__m256i *)&dst[x], dstVector);
483	}
484
485	// 3) Epilogue
486	SIMD_EPILOGUE(x, length, `7`)
487	dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
488	}
489	}
490
491	#if QT_CONFIG(raster_64bit)
492	void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
493	{
494	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
495	if (const_alpha == `255`) {
496	::memcpy(dst, src, length * sizeof(QRgba64));
497	} else {
498	const uint ca = const_alpha \| (const_alpha << `8`); // adjust to [0-65535]
499	const uint cia = `65535` - ca;
500
501	int x = `0`;
502
503	// 1) prologue, align on 32 bytes
504	for (; x < length && (quintptr(dst + x) & `31`); ++x)
505	dst[x] = interpolate65535(src[x], ca, dst[x], cia);
506
507	// 2) interpolate pixels with AVX2
508	const __m256i half = _mm256_set1_epi32(`0x8000`);
509	const __m256i colorMask = _mm256_set1_epi32(`0x0000ffff`);
510	const __m256i constAlphaVector = _mm256_set1_epi32(ca);
511	const __m256i oneMinusConstAlpha = _mm256_set1_epi32(cia);
512	for (; x < length - `3`; x += `4`) {
513	const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
514	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
515	INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
516	_mm256_store_si256((__m256i *)&dst[x], dstVector);
517	}
518
519	// 3) Epilogue
520	SIMD_EPILOGUE(x, length, `3`)
521	dst[x] = interpolate65535(src[x], ca, dst[x], cia);
522	}
523	}
524	#endif
525
526	void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint destPixels, int* length, uint color, uint const_alpha)
527	{
528	if ((const_alpha & qAlpha(color)) == `255`) {
529	qt_memfill32(destPixels, color, length);
530	} else {
531	if (const_alpha != `255`)
532	color = BYTE_MUL(color, const_alpha);
533
534	const quint32 minusAlphaOfColor = qAlpha(~color);
535	int x = `0`;
536
537	quint32 dst = (quint32 ) destPixels;
538	const __m256i colorVector = _mm256_set1_epi32(color);
539	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
540	const __m256i half = _mm256_set1_epi16(`0x80`);
541	const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor);
542
543	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
544	destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
545
546	for (; x < length - `7`; x += `8`) {
547	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
548	BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
549	dstVector = _mm256_add_epi8(colorVector, dstVector);
550	_mm256_store_si256((__m256i *)&dst[x], dstVector);
551	}
552	SIMD_EPILOGUE(x, length, `7`)
553	destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
554	}
555	}
556
557	#if QT_CONFIG(raster_64bit)
558	void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 destPixels, int* length, QRgba64 color, uint const_alpha)
559	{
560	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
561	if (const_alpha == `255` && color.isOpaque()) {
562	qt_memfill64((quint64*)destPixels, color, length);
563	} else {
564	if (const_alpha != `255`)
565	color = multiplyAlpha255(color, const_alpha);
566
567	const uint minusAlphaOfColor = `65535` - color.alpha();
568	int x = `0`;
569	quint64 dst = (quint64 ) destPixels;
570	const __m256i colorVector = _mm256_set1_epi64x(color);
571	const __m256i colorMask = _mm256_set1_epi32(`0x0000ffff`);
572	const __m256i half = _mm256_set1_epi32(`0x8000`);
573	const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(minusAlphaOfColor);
574
575	for (; x < length && (quintptr(dst + x) & `31`); ++x)
576	destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
577
578	for (; x < length - `3`; x += `4`) {
579	__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
580	BYTE_MUL_RGB64_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
581	dstVector = _mm256_add_epi16(colorVector, dstVector);
582	_mm256_store_si256((__m256i *)&dst[x], dstVector);
583	}
584	SIMD_EPILOGUE(x, length, `3`)
585	destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
586	}
587	}
588	#endif
589
// Bilinear interpolation on 256-bit vectors with 4-bit weights.
// From distx (dx) and disty (dy) the macro derives the four weights
//   dx*dy, 16*dx - dx*dy, 16*dy - dx*dy and dx*dy + v_256 - 16*dx - 16*dy
// (the last equals (16-dx)*(16-dy) when v_256 holds 256 in every 16-bit lane),
// applies them to the channels of tlr1/tlr2 (top row) and blr1/blr2 (bottom
// row), sums the contributions and stores one 256-bit result at b with an
// unaligned store.
// NOTE(review): tlr*/blr* presumably hold interleaved left/right neighbour
// pixels, as the unpack of the weight pairs suggests -- confirm at call sites.
#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
{ \
    /* Correct for later unpack */ \
    const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
    const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
    \
    __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
    const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
    const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
    __m256i idxidy = _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
    __m256i dxidy = _mm256_sub_epi16(distx_, dxdy); \
    __m256i idxdy = _mm256_sub_epi16(disty_, dxdy); \
    \
    __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
    __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
    __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
    __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
    __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
    __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
    __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
    __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
    \
    __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
    __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
    tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
    tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
    tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
    tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
    __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
    __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
    blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
    blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
    blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
    blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
    \
    /* Add the values, and shift to only keep 8 significant bits per colors */ \
    __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
    __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
    __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
    __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
    __m256i rAG = _mm256_add_epi16(topAG, botAG); \
    __m256i rRB = _mm256_add_epi16(topRB, botRB); \
    rRB = _mm256_srli_epi16(rRB, 8); \
    /* Correct for hadd */ \
    rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
    rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
    _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
}
638
639	inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
640	{
641	if (v1 < l1)
642	v2 = v1 = l1;
643	else if (v1 >= l2)
644	v2 = v1 = l2;
645	else
646	v2 = v1 + `1`;
647	Q_ASSERT(v1 >= l1 && v1 <= l2);
648	Q_ASSERT(v2 >= l1 && v2 <= l2);
649	}
650
651	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx);
652
653	void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2(uint b, uint end, const QTextureData &image,
654	int &fx, int &fy, int fdx, int /fdy/)
655	{
656	int y1 = (fy >> `16`);
657	int y2;
658	fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - `1`, y1, y2);
659	const uint s1 = (const* uint *)image.scanLine(y1);
660	const uint s2 = (const* uint *)image.scanLine(y2);
661
662	const int disty = (fy & `0x0000ffff`) >> `8`;
663	const int idisty = `256` - disty;
664	const int length = end - b;
665
666	// The intermediate buffer is generated in the positive direction
667	const int adjust = (fdx < `0`) ? fdx * length : `0`;
668	const int offset = (fx + adjust) >> `16`;
669	int x = offset;
670
671	IntermediateBuffer intermediate;
672	// count is the size used in the intermediate_buffer.
673	int count = (qint64(length) * qAbs(fdx) + FixedScale - `1`) / FixedScale + `2`;
674	// length is supposed to be <= BufferSize either because data->m11 < 1 or
675	// data->m11 < 2, and any larger buffers split
676	Q_ASSERT(count <= BufferSize + `2`);
677	int f = `0`;
678	int lim = qMin(count, image.x2 - x);
679	if (x < image.x1) {
680	Q_ASSERT(x < image.x2);
681	uint t = s1[image.x1];
682	uint b = s2[image.x1];
683	quint32 rb = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
684	quint32 ag = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
685	do {
686	intermediate.buffer_rb[f] = rb;
687	intermediate.buffer_ag[f] = ag;
688	f++;
689	x++;
690	} while (x < image.x1 && f < lim);
691	}
692
693	const __m256i disty_ = _mm256_set1_epi16(disty);
694	const __m256i idisty_ = _mm256_set1_epi16(idisty);
695	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
696
697	lim -= `7`;
698	for (; f < lim; x += `8`, f += `8`) {
699	// Load 8 pixels from s1, and split the alpha-green and red-blue component
700	__m256i top = _mm256_loadu_si256((const __m256i)((const* uint *)(s1)+x));
701	__m256i topAG = _mm256_srli_epi16(top, `8`);
702	__m256i topRB = _mm256_and_si256(top, colorMask);
703	// Multiplies each color component by idisty
704	topAG = _mm256_mullo_epi16 (topAG, idisty_);
705	topRB = _mm256_mullo_epi16 (topRB, idisty_);
706
707	// Same for the s2 vector
708	__m256i bottom = _mm256_loadu_si256((const __m256i)((const* uint *)(s2)+x));
709	__m256i bottomAG = _mm256_srli_epi16(bottom, `8`);
710	__m256i bottomRB = _mm256_and_si256(bottom, colorMask);
711	bottomAG = _mm256_mullo_epi16 (bottomAG, disty_);
712	bottomRB = _mm256_mullo_epi16 (bottomRB, disty_);
713
714	// Add the values, and shift to only keep 8 significant bits per colors
715	__m256i rAG =_mm256_add_epi16(topAG, bottomAG);
716	rAG = _mm256_srli_epi16(rAG, `8`);
717	_mm256_storeu_si256((__m256i*)(&intermediate.buffer_ag[f]), rAG);
718	__m256i rRB =_mm256_add_epi16(topRB, bottomRB);
719	rRB = _mm256_srli_epi16(rRB, `8`);
720	_mm256_storeu_si256((__m256i*)(&intermediate.buffer_rb[f]), rRB);
721	}
722
723	for (; f < count; f++) { // Same as above but without simd
724	x = qMin(x, image.x2 - `1`);
725
726	uint t = s1[x];
727	uint b = s2[x];
728
729	intermediate.buffer_rb[f] = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
730	intermediate.buffer_ag[f] = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
731	x++;
732	}
733
734	// Now interpolate the values from the intermediate_buffer to get the final result.
735	intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
736	}
737
738	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx)
739	{
740	fx -= offset * FixedScale;
741
742	const __m128i v_fdx = _mm_set1_epi32(fdx * `4`);
743	const __m128i v_blend = _mm_set1_epi32(`0x00800080`);
744	const __m128i vdx_shuffle = _mm_set_epi8(char(`0x80`), `13`, char(`0x80`), `13`, char(`0x80`), `9`, char(`0x80`), `9`,
745	char(`0x80`), `5`, char(`0x80`), `5`, char(`0x80`), `1`, char(`0x80`), `1`);
746	__m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
747
748	while (b < end - `3`) {
749	const __m128i offset = _mm_srli_epi32(v_fx, `16`);
750	__m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, `4`);
751	__m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, `4`);
752
753	__m128i vdx = _mm_shuffle_epi8(v_fx, vdx_shuffle);
754	__m128i vidx = _mm_sub_epi16(_mm_set1_epi16(`256`), vdx);
755	__m256i vmulx = _mm256_castsi128_si256(_mm_unpacklo_epi32(vidx, vdx));
756	vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), `1`);
757
758	vrb = _mm256_mullo_epi16(vrb, vmulx);
759	vag = _mm256_mullo_epi16(vag, vmulx);
760
761	__m256i vrbag = _mm256_hadd_epi32(vrb, vag);
762	vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
763
764	__m128i rb = _mm256_castsi256_si128(vrbag);
765	__m128i ag = _mm256_extracti128_si256(vrbag, `1`);
766	rb = _mm_srli_epi16(rb, `8`);
767
768	_mm_storeu_si128((__m128i*)b, _mm_blendv_epi8(ag, rb, v_blend));
769
770	b += `4`;
771	v_fx = _mm_add_epi32(v_fx, v_fdx);
772	}
773	fx = _mm_cvtsi128_si32(v_fx);
774	while (b < end) {
775	const int x = (fx >> `16`);
776
777	const uint distx = (fx & `0x0000ffff`) >> `8`;
778	const uint idistx = `256` - distx;
779	const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + `1`] * distx) & `0xff00ff00`;
780	const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + `1`] * distx) & `0xff00ff00`;
781	*b = (rb >> `8`) \| ag;
782	b++;
783	fx += fdx;
784	}
785	fx += offset * FixedScale;
786	}
787
788	void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint b, uint end, const QTextureData &image,
789	int &fx, int &fy, int fdx, int /fdy/)
790	{
791	int y1 = (fy >> `16`);
792	int y2;
793	fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - `1`, y1, y2);
794	const uint s1 = (const* uint *)image.scanLine(y1);
795	const uint s2 = (const* uint *)image.scanLine(y2);
796	const int disty8 = (fy & `0x0000ffff`) >> `8`;
797	const int disty4 = (disty8 + `0x08`) >> `4`;
798
799	const qint64 min_fx = qint64(image.x1) * FixedScale;
800	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
801	while (b < end) {
802	int x1 = (fx >> `16`);
803	int x2;
804	fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - `1`, x1, x2);
805	if (x1 != x2)
806	break;
807	uint top = s1[x1];
808	uint bot = s2[x1];
809	*b = INTERPOLATE_PIXEL_256(top, `256` - disty8, bot, disty8);
810	fx += fdx;
811	++b;
812	}
813	uint *boundedEnd = end;
814	if (fdx > `0`)
815	boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
816	else if (fdx < `0`)
817	boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
818
819	// A fast middle part without boundary checks
820	const __m256i vdistShuffle =
821	_mm256_setr_epi8(`0`, char(`0x80`), `0`, char(`0x80`), `4`, char(`0x80`), `4`, char(`0x80`), `8`, char(`0x80`), `8`, char(`0x80`), `12`, char(`0x80`), `12`, char(`0x80`),
822	`0`, char(`0x80`), `0`, char(`0x80`), `4`, char(`0x80`), `4`, char(`0x80`), `8`, char(`0x80`), `8`, char(`0x80`), `12`, char(`0x80`), `12`, char(`0x80`));
823	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
824	const __m256i v_256 = _mm256_set1_epi16(`256`);
825	const __m256i v_disty = _mm256_set1_epi16(disty4);
826	const __m256i v_fdx = _mm256_set1_epi32(fdx * `8`);
827	const __m256i v_fx_r = _mm256_set1_epi32(`0x08`);
828	const __m256i v_index = _mm256_setr_epi32(`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`);
829	__m256i v_fx = _mm256_set1_epi32(fx);
830	v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));
831
832	while (b < boundedEnd - `7`) {
833	const __m256i offset = _mm256_srli_epi32(v_fx, `16`);
834	const __m128i offsetLo = _mm256_castsi256_si128(offset);
835	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
836	const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, `4`);
837	const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, `4`);
838	const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, `4`);
839	const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, `4`);
840
841	__m256i v_distx = _mm256_srli_epi16(v_fx, `8`);
842	v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fx_r), `4`);
843	v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);
844
845	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
846	b += `8`;
847	v_fx = _mm256_add_epi32(v_fx, v_fdx);
848	}
849	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
850
851	while (b < boundedEnd) {
852	int x = (fx >> `16`);
853	int distx8 = (fx & `0x0000ffff`) >> `8`;
854	*b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
855	fx += fdx;
856	++b;
857	}
858
859	while (b < end) {
860	int x1 = (fx >> `16`);
861	int x2;
862	fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - `1`, x1, x2);
863	uint tl = s1[x1];
864	uint tr = s1[x2];
865	uint bl = s2[x1];
866	uint br = s2[x2];
867	int distx8 = (fx & `0x0000ffff`) >> `8`;
868	*b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
869	fx += fdx;
870	++b;
871	}
872	}
873
874	void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint b, uint end, const QTextureData &image,
875	int &fx, int &fy, int fdx, int fdy)
876	{
877	const qint64 min_fx = qint64(image.x1) * FixedScale;
878	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
879	const qint64 min_fy = qint64(image.y1) * FixedScale;
880	const qint64 max_fy = qint64(image.y2 - `1`) * FixedScale;
881	// first handle the possibly bounded part in the beginning
882	while (b < end) {
883	int x1 = (fx >> `16`);
884	int x2;
885	int y1 = (fy >> `16`);
886	int y2;
887	fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - `1`, x1, x2);
888	fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - `1`, y1, y2);
889	if (x1 != x2 && y1 != y2)
890	break;
891	const uint s1 = (const* uint *)image.scanLine(y1);
892	const uint s2 = (const* uint *)image.scanLine(y2);
893	uint tl = s1[x1];
894	uint tr = s1[x2];
895	uint bl = s2[x1];
896	uint br = s2[x2];
897	int distx = (fx & `0x0000ffff`) >> `8`;
898	int disty = (fy & `0x0000ffff`) >> `8`;
899	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
900	fx += fdx;
901	fy += fdy;
902	++b;
903	}
904	uint *boundedEnd = end;
905	if (fdx > `0`)
906	boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
907	else if (fdx < `0`)
908	boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
909	if (fdy > `0`)
910	boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
911	else if (fdy < `0`)
912	boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
913
914	// until boundedEnd we can now have a fast middle part without boundary checks
915	const __m256i vdistShuffle =
916	_mm256_setr_epi8(`0`, char(`0x80`), `0`, char(`0x80`), `4`, char(`0x80`), `4`, char(`0x80`), `8`, char(`0x80`), `8`, char(`0x80`), `12`, char(`0x80`), `12`, char(`0x80`),
917	`0`, char(`0x80`), `0`, char(`0x80`), `4`, char(`0x80`), `4`, char(`0x80`), `8`, char(`0x80`), `8`, char(`0x80`), `12`, char(`0x80`), `12`, char(`0x80`));
918	const __m256i colorMask = _mm256_set1_epi32(`0x00ff00ff`);
919	const __m256i v_256 = _mm256_set1_epi16(`256`);
920	const __m256i v_fdx = _mm256_set1_epi32(fdx * `8`);
921	const __m256i v_fdy = _mm256_set1_epi32(fdy * `8`);
922	const __m256i v_fxy_r = _mm256_set1_epi32(`0x08`);
923	const __m256i v_index = _mm256_setr_epi32(`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`);
924	__m256i v_fx = _mm256_set1_epi32(fx);
925	__m256i v_fy = _mm256_set1_epi32(fy);
926	v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));
927	v_fy = _mm256_add_epi32(v_fy, _mm256_mullo_epi32(_mm256_set1_epi32(fdy), v_index));
928
929	const uchar *textureData = image.imageData;
930	const qsizetype bytesPerLine = image.bytesPerLine;
931	const __m256i vbpl = _mm256_set1_epi16(bytesPerLine/`4`);
932
933	while (b < boundedEnd - `7`) {
934	const __m256i vy = _mm256_packs_epi32(_mm256_srli_epi32(v_fy, `16`), _mm256_setzero_si256());
935	// 8x16bit 8x16bit -> 8x32bit*
936	__m256i offset = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vy, vbpl), _mm256_mulhi_epi16(vy, vbpl));
937	offset = _mm256_add_epi32(offset, _mm256_srli_epi32(v_fx, `16`));
938	const __m128i offsetLo = _mm256_castsi256_si128(offset);
939	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
940	const uint topData = (const* uint *)(textureData);
941	const uint botData = (const* uint *)(textureData + bytesPerLine);
942	const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, `4`);
943	const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, `4`);
944	const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, `4`);
945	const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, `4`);
946
947	__m256i v_distx = _mm256_srli_epi16(v_fx, `8`);
948	__m256i v_disty = _mm256_srli_epi16(v_fy, `8`);
949	v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fxy_r), `4`);
950	v_disty = _mm256_srli_epi16(_mm256_add_epi32(v_disty, v_fxy_r), `4`);
951	v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);
952	v_disty = _mm256_shuffle_epi8(v_disty, vdistShuffle);
953
954	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
955	b += `8`;
956	v_fx = _mm256_add_epi32(v_fx, v_fdx);
957	v_fy = _mm256_add_epi32(v_fy, v_fdy);
958	}
959	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
960	fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , `0`);
961
962	while (b < boundedEnd) {
963	int x = (fx >> `16`);
964	int y = (fy >> `16`);
965
966	const uint s1 = (const* uint *)image.scanLine(y);
967	const uint s2 = (const* uint *)image.scanLine(y + `1`);
968
969	int distx = (fx & `0x0000ffff`) >> `8`;
970	int disty = (fy & `0x0000ffff`) >> `8`;
971	*b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
972
973	fx += fdx;
974	fy += fdy;
975	++b;
976	}
977
978	while (b < end) {
979	int x1 = (fx >> `16`);
980	int x2;
981	int y1 = (fy >> `16`);
982	int y2;
983
984	fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - `1`, x1, x2);
985	fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - `1`, y1, y2);
986
987	const uint s1 = (const* uint *)image.scanLine(y1);
988	const uint s2 = (const* uint *)image.scanLine(y2);
989
990	uint tl = s1[x1];
991	uint tr = s1[x2];
992	uint bl = s2[x1];
993	uint br = s2[x2];
994
995	int distx = (fx & `0x0000ffff`) >> `8`;
996	int disty = (fy & `0x0000ffff`) >> `8`;
997	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
998
999	fx += fdx;
1000	fy += fdy;
1001	++b;
1002	}
1003	}
1004
1005	static inline __m256i epilogueMaskFromCount(qsizetype count)
1006	{
1007	Q_ASSERT(count > `0`);
1008	static const __m256i offsetMask = _mm256_setr_epi32(`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`);
1009	return _mm256_add_epi32(offsetMask, _mm256_set1_epi32(-count));
1010	}
1011
1012	template<bool RGBA>
1013	static void convertARGBToARGB32PM_avx2(uint buffer, const* uint *src, qsizetype count)
1014	{
1015	qsizetype i = `0`;
1016	const __m256i alphaMask = _mm256_set1_epi32(`0xff000000`);
1017	const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(`2`, `1`, `0`, `3`, `6`, `5`, `4`, `7`, `10`, `9`, `8`, `11`, `14`, `13`, `12`, `15`));
1018	const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(`6`, `7`, `6`, `7`, `6`, `7`, `6`, `7`, `14`, `15`, `14`, `15`, `14`, `15`, `14`, `15`));
1019	const __m256i half = _mm256_set1_epi16(`0x0080`);
1020	const __m256i zero = _mm256_setzero_si256();
1021
1022	for (; i < count - `7`; i += `8`) {
1023	__m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
1024	if (!_mm256_testz_si256(srcVector, alphaMask)) {
1025	// keep the two _mm_test[zc]_siXXX next to each other
1026	bool cf = _mm256_testc_si256(srcVector, alphaMask);
1027	if (RGBA)
1028	srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1029	if (!cf) {
1030	__m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
1031	__m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
1032	__m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1033	__m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1034	src1 = _mm256_mullo_epi16(src1, alpha1);
1035	src2 = _mm256_mullo_epi16(src2, alpha2);
1036	src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, `8`));
1037	src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, `8`));
1038	src1 = _mm256_add_epi16(src1, half);
1039	src2 = _mm256_add_epi16(src2, half);
1040	src1 = _mm256_srli_epi16(src1, `8`);
1041	src2 = _mm256_srli_epi16(src2, `8`);
1042	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1043	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1044	srcVector = _mm256_packus_epi16(src1, src2);
1045	_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
1046	} else {
1047	if (buffer != src \|\| RGBA)
1048	_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
1049	}
1050	} else {
1051	_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), zero);
1052	}
1053	}
1054
1055	if (i < count) {
1056	const __m256i epilogueMask = epilogueMaskFromCount(count - i);
1057	__m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
1058	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
1059
1060	if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
1061	// keep the two _mm_test[zc]_siXXX next to each other
1062	bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
1063	if (RGBA)
1064	srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1065	if (!cf) {
1066	__m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
1067	__m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
1068	__m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1069	__m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1070	src1 = _mm256_mullo_epi16(src1, alpha1);
1071	src2 = _mm256_mullo_epi16(src2, alpha2);
1072	src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, `8`));
1073	src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, `8`));
1074	src1 = _mm256_add_epi16(src1, half);
1075	src2 = _mm256_add_epi16(src2, half);
1076	src1 = _mm256_srli_epi16(src1, `8`);
1077	src2 = _mm256_srli_epi16(src2, `8`);
1078	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1079	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1080	srcVector = _mm256_packus_epi16(src1, src2);
1081	_mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
1082	} else {
1083	if (buffer != src \|\| RGBA)
1084	_mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
1085	}
1086	} else {
1087	_mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero);
1088	}
1089	}
1090	}
1091
1092	void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint buffer, int* count, const QList<QRgb> *)
1093	{
1094	convertARGBToARGB32PM_avx2<false>(buffer, buffer, count);
1095	}
1096
1097	void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint buffer, int* count, const QList<QRgb> *)
1098	{
1099	convertARGBToARGB32PM_avx2<true>(buffer, buffer, count);
1100	}
1101
1102	const uint QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1103	const QList<QRgb> , QDitherInfo )
1104	{
1105	convertARGBToARGB32PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1106	return buffer;
1107	}
1108
1109	const uint QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1110	const QList<QRgb> , QDitherInfo )
1111	{
1112	convertARGBToARGB32PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1113	return buffer;
1114	}
1115
1116	template<bool RGBA>
1117	static void convertARGBToRGBA64PM_avx2(QRgba64 buffer, const* uint *src, qsizetype count)
1118	{
1119	qsizetype i = `0`;
1120	const __m256i alphaMask = _mm256_set1_epi32(`0xff000000`);
1121	const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(`2`, `1`, `0`, `3`, `6`, `5`, `4`, `7`, `10`, `9`, `8`, `11`, `14`, `13`, `12`, `15`));
1122	const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(`6`, `7`, `6`, `7`, `6`, `7`, `6`, `7`, `14`, `15`, `14`, `15`, `14`, `15`, `14`, `15`));
1123	const __m256i zero = _mm256_setzero_si256();
1124
1125	for (; i < count - `7`; i += `8`) {
1126	__m256i dst1, dst2;
1127	__m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
1128	if (!_mm256_testz_si256(srcVector, alphaMask)) {
1129	// keep the two _mm_test[zc]_siXXX next to each other
1130	bool cf = _mm256_testc_si256(srcVector, alphaMask);
1131	if (!RGBA)
1132	srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1133
1134	// The two unpack instructions unpack the low and upper halves of
1135	// each 128-bit half of the 256-bit register. Here's the tracking
1136	// of what's where: (p is 32-bit, P is 64-bit)
1137	// as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
1138	// after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
1139	// after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
1140	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1141
1142	const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
1143	const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
1144	if (!cf) {
1145	const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1146	const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1147	dst1 = _mm256_mulhi_epu16(src1, alpha1);
1148	dst2 = _mm256_mulhi_epu16(src2, alpha2);
1149	dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, `15`));
1150	dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, `15`));
1151	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1152	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1153	} else {
1154	dst1 = src1;
1155	dst2 = src2;
1156	}
1157	} else {
1158	dst1 = dst2 = zero;
1159	}
1160	_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1);
1161	_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + `1`, dst2);
1162	}
1163
1164	if (i < count) {
1165	__m256i epilogueMask = epilogueMaskFromCount(count - i);
1166	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
1167	__m256i dst1, dst2;
1168	__m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
1169
1170	if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
1171	// keep the two _mm_test[zc]_siXXX next to each other
1172	bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
1173	if (!RGBA)
1174	srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1175	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1176	const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
1177	const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
1178	if (!cf) {
1179	const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1180	const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1181	dst1 = _mm256_mulhi_epu16(src1, alpha1);
1182	dst2 = _mm256_mulhi_epu16(src2, alpha2);
1183	dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, `15`));
1184	dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, `15`));
1185	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1186	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1187	} else {
1188	dst1 = src1;
1189	dst2 = src2;
1190	}
1191	} else {
1192	dst1 = dst2 = zero;
1193	}
1194	epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1195	_mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i),
1196	_mm256_unpacklo_epi32(epilogueMask, epilogueMask),
1197	dst1);
1198	_mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + `4`),
1199	_mm256_unpackhi_epi32(epilogueMask, epilogueMask),
1200	dst2);
1201	}
1202	}
1203
1204	const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1205	const QList<QRgb> , QDitherInfo )
1206	{
1207	convertARGBToRGBA64PM_avx2<false>(buffer, src, count);
1208	return buffer;
1209	}
1210
1211	const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1212	const QList<QRgb> , QDitherInfo )
1213	{
1214	convertARGBToRGBA64PM_avx2<true>(buffer, src, count);
1215	return buffer;
1216	}
1217
1218	const QRgba64 QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1219	const QList<QRgb> , QDitherInfo )
1220	{
1221	convertARGBToRGBA64PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1222	return buffer;
1223	}
1224
1225	const QRgba64 QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1226	const QList<QRgb> , QDitherInfo )
1227	{
1228	convertARGBToRGBA64PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1229	return buffer;
1230	}
1231
1232	QT_END_NAMESPACE
1233
1234	#endif
1235

Browse the source code of Qt/src/gui/painting/qdrawhelper_avx2.cpp