/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#ifndef QSIMD_P_H
#define QSIMD_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <QtCore/private/qglobal_p.h>
#include <QtCore/qsimd.h>

/*
 * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
 * They mean the compiler supports the necessary flags and the headers
 * for the x86 and ARM intrinsics:
 *  - GCC: the -mXXX or -march=YYY flag is necessary before the #include,
 *    up to GCC 4.8; GCC >= 4.9 can include unconditionally
 *  - Intel CC: #include can happen unconditionally
 *  - MSVC: #include can happen unconditionally
 *  - RVCT: ???
 *
 * We will try to include all headers possible under this configuration.
 *
 * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
 * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
 *
 * Supported XXX are:
 *   Flag     | Arch |  GCC  | Intel CC |  MSVC  |
 *  ARM_NEON  | ARM  | I & C | None     |   ?    |
 *  SSE2      | x86  | I & C | I & C    | I & C  |
 *  SSE3      | x86  | I & C | I & C    | I only |
 *  SSSE3     | x86  | I & C | I & C    | I only |
 *  SSE4_1    | x86  | I & C | I & C    | I only |
 *  SSE4_2    | x86  | I & C | I & C    | I only |
 *  AVX       | x86  | I & C | I & C    | I & C  |
 *  AVX2      | x86  | I & C | I & C    | I only |
 *  AVX512xx  | x86  | I & C | I & C    | I only |
 * I = intrinsics; C = code generation
 *
 * Code can use the following constructs to determine compiler support & status:
 * - #ifdef __XXX__      (e.g.: #ifdef __AVX__ or #ifdef __ARM_NEON__)
 *   If this test passes, then the compiler is already generating code for that
 *   given sub-architecture. The intrinsics for that sub-architecture are
 *   #included and can be used without restriction or runtime check.
 *
 * - #if QT_COMPILER_SUPPORTS(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in another translation unit, given the right set of
 *   flags. Use of the intrinsics is not guaranteed. This is useful with
 *   runtime detection (see below).
 *
 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in this translation unit, even if it is not doing
 *   that now (it might be). Individual functions may be tagged with
 *   QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 *   sub-arch. Only inside such functions is the use of the intrinsics
 *   guaranteed to work. This is useful with runtime detection (see below).
 *
 * Runtime detection of a CPU sub-architecture can be done with the
 * qCpuHasFeature(XXX) function. There are two strategies for generating
 * such optimized code:
 *
 * 1) place the optimized code in a different translation unit (C or assembly
 * sources) and pass the correct flags to the compiler to enable support. Those
 * sources must not include qglobal.h, which means they cannot include this
 * file either. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 *
 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
 * other Qt code. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 */
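
/*
 * As an illustrative sketch (not part of the Qt API): with strategy 2, the
 * optimized implementation itself lives in the same translation unit, guarded
 * and tagged so that only that one function is compiled for the higher
 * sub-architecture. The feature name SSE4_1 and the function names below are
 * hypothetical examples:
 *
 *      #if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
 *      QT_FUNCTION_TARGET(SSE4_1)
 *      static void foo_optimized_xxx()
 *      {
 *          // SSE4.1 intrinsics may be used here, and only here, because the
 *          // target attribute enables SSE4.1 for this function alone
 *      }
 *      #endif
 *
 * The dispatcher shown above then calls foo_optimized_xxx() only after
 * qCpuHasFeature(SSE4_1) has confirmed support at runtime.
 */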

#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
#include <intrin.h>
#endif

#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)

#if defined(Q_PROCESSOR_ARM)
# define QT_COMPILER_SUPPORTS_HERE(x) (__ARM_FEATURE_ ## x)
# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600
   /* GCC requires attributes for a function */
#  define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
#  define QT_FUNCTION_TARGET(x)
# endif
# if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__)
#  define __ARM_FEATURE_NEON // also support QT_COMPILER_SUPPORTS_HERE(NEON)
# endif
#elif defined(Q_PROCESSOR_MIPS)
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#  define __MIPS_DSP__
# endif
# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#  define __MIPS_DSPR2__
# endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
   /* GCC requires attributes for a function */
#  define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
#  define QT_FUNCTION_TARGET(x)
# endif
#else
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
#endif
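
/*
 * Illustration only (assuming qsimd_x86_p.h defines
 * QT_FUNCTION_TARGET_STRING_AVX2 as "avx2"): on GCC/Clang with
 * QT_COMPILER_SUPPORTS_SIMD_ALWAYS,
 *
 *      QT_FUNCTION_TARGET(AVX2) void f();
 *
 * expands to
 *
 *      __attribute__((__target__("avx2"))) void f();
 *
 * while on compilers where QT_FUNCTION_TARGET(x) expands to nothing (MSVC,
 * the Intel compiler), the intrinsics are usable without per-function
 * attributes anyway.
 */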

#ifdef Q_PROCESSOR_X86
/* -- x86 intrinsic support -- */

# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE__ or __SSE2__, so do it ourselves
#  define __SSE__ 1
#  define __SSE2__ 1
# endif

# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
// GCC 4.4 and Clang 2.8 added a few more intrinsics there
#  include <x86intrin.h>
# endif

# if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
// (but neither MSVC nor the Intel compiler define this macro)
#  define __POPCNT__ 1
# endif

// AVX intrinsics
# if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// AES, PCLMULQDQ instructions:
// All processors that support AVX support AES & PCLMULQDQ
// (but neither MSVC nor the Intel compiler define these macros)
#  define __AES__ 1
#  define __PCLMUL__ 1
# endif

# if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// F16C & RDRAND instructions:
// All processors that support AVX2 support F16C & RDRAND
// (but neither MSVC nor the Intel compiler define these macros)
#  define __F16C__ 1
#  define __RDRND__ 1
# endif

# if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL)
// BMI2 instructions:
// All processors that support BMI support BMI2 (and AVX2)
// (but neither MSVC nor the Intel compiler define this macro)
#  define __BMI2__ 1
# endif

# include "qsimd_x86_p.h"

// Haswell sub-architecture
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
// same features.
//
// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell"
# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
     defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
#  define __haswell__ 1
# endif

// This constant does not include all CPU features found in a Haswell, only
// those that we'd have optimized code for.
// Note: a plain static const is used here (not constexpr), as this file may
// be compiled in C mode.
QT_BEGIN_NAMESPACE
static const quint64 CpuFeatureArchHaswell = 0
        | CpuFeatureSSE2
        | CpuFeatureSSE3
        | CpuFeatureSSSE3
        | CpuFeatureSSE4_1
        | CpuFeatureSSE4_2
        | CpuFeatureFMA
        | CpuFeaturePOPCNT
        | CpuFeatureAVX
        | CpuFeatureF16C
        | CpuFeatureAVX2
        | CpuFeatureBMI
        | CpuFeatureBMI2;
QT_END_NAMESPACE
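
// For illustration, a caller could select a Haswell-tuned code path at
// runtime with a check along these lines (foo_haswell() and foo_generic()
// are hypothetical functions, shown only as an example; qCpuFeatures() is
// declared further down in this file):
//
//      if ((qCpuFeatures() & CpuFeatureArchHaswell) == CpuFeatureArchHaswell)
//          foo_haswell();      // e.g. compiled with QT_FUNCTION_TARGET(ARCH_HASWELL)
//      else
//          foo_generic();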

#endif /* Q_PROCESSOR_X86 */

// Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html
// This should be tweaked with an "upper version" of clang once we know which release fixes the
// issue. At that point we can rely on __ARM_FEATURE_CRC32 again.
#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined(__ARM_FEATURE_CRC32)
# undef __ARM_FEATURE_CRC32
#endif

// NEON intrinsics
// note: as of GCC 4.9, GCC does not support function targets for ARM
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: GCC doesn't support function targets on non-AArch64, and on AArch64 NEON is always available.
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
#define __ARM_NEON__
#endif
#endif

// AArch64/ARM64
#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32)
#if defined(Q_PROCESSOR_ARM_64)
// only available on AArch64
#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
#endif
# include <arm_acle.h>
#endif
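
// For illustration, a CRC32-accelerated helper on AArch64 could be written
// like the sketch below (a hypothetical function, shown only to demonstrate
// the macros; __crc32w() is provided by <arm_acle.h>):
//
//      #if defined(Q_PROCESSOR_ARM_64) && QT_COMPILER_SUPPORTS_HERE(CRC32)
//      QT_FUNCTION_TARGET(CRC32)
//      static quint32 crc32Step(quint32 crc, quint32 data)
//      { return __crc32w(crc, data); }
//      #endif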

#ifdef __cplusplus
#include <qatomic.h>

QT_BEGIN_NAMESPACE

#ifndef Q_PROCESSOR_X86
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON      = 2,
    CpuFeatureARM_NEON  = CpuFeatureNEON,
    CpuFeatureCRC32     = 4,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP       = 2,
    CpuFeatureDSPR2     = 4,
#endif

    // used only to indicate that the CPU detection was initialised
    QSimdInitialized    = 1
};

static const quint64 qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
        | CpuFeatureNEON
#endif
#if defined __ARM_FEATURE_CRC32
        | CpuFeatureCRC32
#endif
#if defined __mips_dsp
        | CpuFeatureDSP
#endif
#if defined __mips_dspr2
        | CpuFeatureDSPR2
#endif
        ;
#endif

#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
#else
extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
#endif
Q_CORE_EXPORT quint64 qDetectCpuFeatures();

#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
#else
static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
{
    return 0;
}
#endif

static inline quint64 qCpuFeatures()
{
    quint64 features = qt_cpu_features[0].loadRelaxed();
#ifndef Q_ATOMIC_INT64_IS_SUPPORTED
    features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
#endif
    if (Q_UNLIKELY(features == 0)) {
        features = qDetectCpuFeatures();
        Q_ASSUME(features != 0);
    }
    return features;
}

#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                 || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
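
// Note that the first half of qCpuHasFeature() is a compile-time constant: if
// the compiler is already generating code for the feature (e.g. the whole file
// is built with -msse4.2), the check folds to true and the runtime lookup is
// optimized away. A minimal usage sketch (doSomethingFast() and
// doSomethingGeneric() are hypothetical):
//
//      if (qCpuHasFeature(SSE4_2))
//          doSomethingFast();
//      else
//          doSomethingGeneric();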

inline bool qHasHwrng()
{
#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
    return qCpuHasFeature(RDRND);
#else
    return false;
#endif
}

#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)

#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
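
// These prologues assume 4-byte elements: they advance i until (ptr + i) is
// 16- or 32-byte aligned (assuming ptr itself is at least 4-byte aligned), so
// that the vectorized main loop can use aligned loads. A minimal usage sketch
// over a quint32 array, together with SIMD_EPILOGUE from the end of this file
// (processOne() and processFour() are hypothetical helpers):
//
//      int i = 0;
//      ALIGNMENT_PROLOGUE_16BYTES(data, i, length)
//          processOne(data[i]);            // scalar head until 16-byte alignment
//      for (; i + 4 <= length; i += 4)
//          processFour(data + i);          // 4 x quint32 = 16 bytes per iteration
//      SIMD_EPILOGUE(i, length, 3)
//          processOne(data[i]);            // scalar tail, at most 3 elements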

QT_END_NAMESPACE

#endif // __cplusplus

#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < max && i < length; ++i, ++_i)

#endif // QSIMD_P_H