/****************************************************************************
**
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtGui module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include <private/qdrawhelper_x86_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSSE3)

#include <private/qdrawingprimitive_sse2_p.h>

QT_BEGIN_NAMESPACE

/* The palignr instruction takes its byte shift as an immediate operand, so we have to generate
   the code for each of the different shifts (4, 8, 12). Checking the alignment inside the loop
   is unfortunately way too slow.
 */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
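
/* Illustrative sketch of what the palignr realignment in the loop above does:
   _mm_alignr_epi8(hi, lo, n) concatenates the two 16-byte registers (lo in the
   lower half) and extracts 16 bytes starting at byte offset n. With 4-byte ARGB32
   pixels, an offset of 4 stitches an unaligned run of source pixels out of two
   aligned loads:

       lo = [ p0 p1 p2 p3 ]   hi = [ p4 p5 p6 p7 ]
       _mm_alignr_epi8(hi, lo, 4)  ->  [ p1 p2 p3 p4 ]

   which is why the loop only ever issues aligned loads and shifts by 4, 8 or 12. */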


// Blend src over dst (both premultiplied ARGB32).
// nullVector, half, one, colorMask and alphaMask are constant across the whole image/texture,
// and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if the source is fully opaque or fully transparent.
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

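    /* dst is now 16-byte aligned, but src may not be: measure how many pixels (0..3)
       src sits past the previous 16-byte boundary, so the loops below can keep using
       aligned loads and realign the data with palignr. */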
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
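
/* For reference, a scalar sketch of the per-pixel operation the vector loops above
   implement, mirroring what blend_pixel() does for the prologue/epilogue pixels
   (assuming premultiplied ARGB32 and the BYTE_MUL()/qAlpha() helpers from
   qdrawhelper_p.h and qrgb.h):

       if (src >= 0xff000000)                       // fully opaque: plain copy
           dst = src;
       else if (src != 0)                           // not fully transparent:
           dst = src + BYTE_MUL(dst, qAlpha(~src)); // s + d * (255 - sa) / 255

   qAlpha(~src) is 255 minus the source alpha, so the vector code's
   "one - alphaChannel" corresponds to the (255 - sa) factor here. */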

void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
                                     int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const_alpha = (const_alpha * 255) >> 8;
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
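
/* A scalar sketch of the const_alpha != 0 branch above, under the same assumptions
   (premultiplied ARGB32, BYTE_MUL() from qdrawhelper_p.h). After const_alpha has
   been rescaled from the 0..256 range to 0..255:

       quint32 s = BYTE_MUL(src[x], const_alpha);   // s * ca
       dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));   // s*ca + d * (1 - sa*ca)

   since qAlpha(~s) == 255 - qAlpha(s), which is 255 - sa*ca once s has been
   pre-multiplied by the constant alpha. */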

const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
{
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);

const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
                                                         int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
    return buffer;
}

void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
        shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
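
    /* mval1, mval2 and mval3 sample the infinite repetition of the 3-byte pattern at
       byte offsets 0, 16 and 32; because 16 mod 3 == 1 and 32 mod 3 == 2, they are the
       pattern at phases 0, 1 and 2, so storing them back to back lays down 48 bytes,
       i.e. 16 whole pixels, per iteration of the loop below. */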

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}

void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
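
    /* A 16-byte register holds 5 1/3 RGB888 pixels, so pixels 5 and 10 of each 16-pixel
       group straddle the 16-byte boundaries. The masks above swap R and B within every
       pixel that lies entirely inside one register; the lanes marked with !! belong to
       the straddling pixels, are left in place by the shuffles, and are patched up by
       the scalar std::swap calls in the loop below. */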

    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the four bytes of the two straddling pixels the shuffles left in place
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}

QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_SSSE3