1 | /******************************************************************************* |
2 | * Copyright 2016-2019 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /******************************************************************************* |
18 | * Copyright (c) 2007 MITSUNARI Shigeo |
19 | * All rights reserved. |
20 | * |
21 | * Redistribution and use in source and binary forms, with or without |
22 | * modification, are permitted provided that the following conditions are met: |
23 | * |
24 | * Redistributions of source code must retain the above copyright notice, this |
25 | * list of conditions and the following disclaimer. |
26 | * Redistributions in binary form must reproduce the above copyright notice, |
27 | * this list of conditions and the following disclaimer in the documentation |
28 | * and/or other materials provided with the distribution. |
29 | * Neither the name of the copyright owner nor the names of its contributors may |
30 | * be used to endorse or promote products derived from this software without |
31 | * specific prior written permission. |
32 | * |
33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
34 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
35 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
36 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
37 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
38 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
39 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
40 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
41 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
42 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
43 | * THE POSSIBILITY OF SUCH DAMAGE. |
44 | *******************************************************************************/ |
45 | |
46 | #ifndef XBYAK_XBYAK_UTIL_H_ |
47 | #define XBYAK_XBYAK_UTIL_H_ |
48 | |
49 | /** |
50 | utility class and functions for Xbyak |
51 | Xbyak::util::Clock ; rdtsc timer |
52 | Xbyak::util::Cpu ; detect CPU |
53 | @note this header is UNDER CONSTRUCTION! |
54 | */ |
55 | #include "xbyak.h" |
56 | |
57 | #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) |
58 | #define XBYAK_INTEL_CPU_SPECIFIC |
59 | #endif |
60 | |
61 | #ifdef XBYAK_INTEL_CPU_SPECIFIC |
62 | #ifdef _MSC_VER |
63 | #if (_MSC_VER < 1400) && defined(XBYAK32) |
// Replacement for the __cpuid intrinsic, compiled only for 32-bit MSVC older
// than _MSC_VER 1400 (see the enclosing #if); other MSVC builds include <intrin.h>.
// Arguments: (int data[4], int eaxIn). Runs cpuid with eax = eaxIn and stores
// eax, ebx, ecx, edx into data[0..3].
// The function is naked, so the arguments are addressed relative to esp:
// "4 * 2" skips the two registers pushed below and the remaining offset skips
// the return address (and, for eaxIn, the first argument).
static inline __declspec(naked) void __cpuid(int[4], int)
{
	__asm {
		push ebx
		push esi
		mov eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
		cpuid
		mov esi, dword ptr [esp + 4 * 2 + 4] // data
		mov dword ptr [esi], eax
		mov dword ptr [esi + 4], ebx
		mov dword ptr [esi + 8], ecx
		mov dword ptr [esi + 12], edx
		pop esi
		pop ebx
		ret
	}
}
81 | #else |
82 | #include <intrin.h> // for __cpuid |
83 | #endif |
84 | #else |
// Version test: true when the compiler's (__GNUC__, __GNUC_MINOR__) pair is at
// least (major, minor). Only defined here when no earlier header provided it.
#if !defined(__GNUC_PREREQ)
	#define __GNUC_PREREQ(major, minor) \
		(((__GNUC__ << 16) + __GNUC_MINOR__) >= (((major) << 16) + (minor)))
#endif
88 | #if __GNUC_PREREQ(4, 3) && !defined(__APPLE__) |
89 | #include <cpuid.h> |
90 | #else |
91 | #if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm' |
// ebx cannot be named as an asm output on this target (see the error quoted on
// the enclosing #if), so ebx is saved on the stack, the ebx result of cpuid is
// copied into esi -- the register bound to output `b` by the "=S" constraint --
// and ebx is restored afterwards.
// The copy must read %ebx: reading %ebp would hand the caller the frame pointer
// instead of the CPUID result.
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
94 | #else |
// Generic inline-asm fallback used when <cpuid.h> is not included:
// run cpuid with eax = eaxIn (and ecx = ecxIn for the _count form) and
// write eax, ebx, ecx, edx to a, b, c, d.
#define __cpuid(eaxIn, a, b, c, d) \
	__asm__ __volatile__("cpuid\n" \
		: "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
		: "0"(eaxIn))
#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) \
	__asm__ __volatile__("cpuid\n" \
		: "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
		: "0"(eaxIn), "2"(ecxIn))
97 | #endif |
98 | #endif |
99 | #endif |
100 | #endif |
101 | |
102 | namespace Xbyak { namespace util { |
103 | |
/**
	Topology level selector for Cpu::getNumCores().
	The values are compared with the level type read from CPUID leaf 0xB and,
	minus one, index Cpu's per-level core-count table.
*/
enum IntelCpuTopologyLevel {
	SmtLevel = 1,
	CoreLevel = 2
};
108 | |
109 | /** |
110 | CPU detection class |
111 | */ |
112 | class Cpu { |
113 | uint64 type_; |
114 | //system topology |
115 | bool x2APIC_supported_; |
116 | static const size_t maxTopologyLevels = 2; |
117 | unsigned int numCores_[maxTopologyLevels]; |
118 | |
119 | static const unsigned int maxNumberCacheLevels = 10; |
120 | unsigned int dataCacheSize_[maxNumberCacheLevels]; |
121 | unsigned int coresSharignDataCache_[maxNumberCacheLevels]; |
122 | unsigned int dataCacheLevels_; |
123 | |
124 | unsigned int get32bitAsBE(const char *x) const |
125 | { |
126 | return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); |
127 | } |
128 | unsigned int mask(int n) const |
129 | { |
130 | return (1U << n) - 1; |
131 | } |
132 | void setFamily() |
133 | { |
134 | unsigned int data[4] = {}; |
135 | getCpuid(1, data); |
136 | stepping = data[0] & mask(4); |
137 | model = (data[0] >> 4) & mask(4); |
138 | family = (data[0] >> 8) & mask(4); |
139 | // type = (data[0] >> 12) & mask(2); |
140 | extModel = (data[0] >> 16) & mask(4); |
141 | extFamily = (data[0] >> 20) & mask(8); |
142 | if (family == 0x0f) { |
143 | displayFamily = family + extFamily; |
144 | } else { |
145 | displayFamily = family; |
146 | } |
147 | if (family == 6 || family == 0x0f) { |
148 | displayModel = (extModel << 4) + model; |
149 | } else { |
150 | displayModel = model; |
151 | } |
152 | } |
153 | unsigned int (unsigned int val, unsigned int base, unsigned int end) |
154 | { |
155 | return (val >> base) & ((1u << (end - base)) - 1); |
156 | } |
157 | void setNumCores() |
158 | { |
159 | if ((type_ & tINTEL) == 0) return; |
160 | |
161 | unsigned int data[4] = {}; |
162 | |
163 | /* CAUTION: These numbers are configuration as shipped by Intel. */ |
164 | getCpuidEx(0x0, 0, data); |
165 | if (data[0] >= 0xB) { |
166 | /* |
167 | if leaf 11 exists(x2APIC is supported), |
168 | we use it to get the number of smt cores and cores on socket |
169 | |
170 | leaf 0xB can be zeroed-out by a hypervisor |
171 | */ |
172 | x2APIC_supported_ = true; |
173 | for (unsigned int i = 0; i < maxTopologyLevels; i++) { |
174 | getCpuidEx(0xB, i, data); |
175 | IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); |
176 | if (level == SmtLevel || level == CoreLevel) { |
177 | numCores_[level - 1] = extractBit(data[1], 0, 15); |
178 | } |
179 | } |
180 | } else { |
181 | /* |
182 | Failed to deremine num of cores without x2APIC support. |
183 | TODO: USE initial APIC ID to determine ncores. |
184 | */ |
185 | numCores_[SmtLevel - 1] = 0; |
186 | numCores_[CoreLevel - 1] = 0; |
187 | } |
188 | |
189 | } |
190 | void setCacheHierarchy() |
191 | { |
192 | if ((type_ & tINTEL) == 0) return; |
193 | const unsigned int NO_CACHE = 0; |
194 | const unsigned int DATA_CACHE = 1; |
195 | // const unsigned int INSTRUCTION_CACHE = 2; |
196 | const unsigned int UNIFIED_CACHE = 3; |
197 | unsigned int smt_width = 0; |
198 | unsigned int logical_cores = 0; |
199 | unsigned int data[4] = {}; |
200 | |
201 | if (x2APIC_supported_) { |
202 | smt_width = numCores_[0]; |
203 | logical_cores = numCores_[1]; |
204 | } |
205 | |
206 | /* |
207 | Assumptions: |
208 | the first level of data cache is not shared (which is the |
209 | case for every existing architecture) and use this to |
210 | determine the SMT width for arch not supporting leaf 11. |
211 | when leaf 4 reports a number of core less than numCores_ |
212 | on socket reported by leaf 11, then it is a correct number |
213 | of cores not an upperbound. |
214 | */ |
215 | for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { |
216 | getCpuidEx(0x4, i, data); |
217 | unsigned int cacheType = extractBit(data[0], 0, 4); |
218 | if (cacheType == NO_CACHE) break; |
219 | if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { |
220 | unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; |
221 | if (logical_cores != 0) { // true only if leaf 0xB is supported and valid |
222 | actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); |
223 | } |
224 | assert(actual_logical_cores != 0); |
225 | dataCacheSize_[dataCacheLevels_] = |
226 | (extractBit(data[1], 22, 31) + 1) |
227 | * (extractBit(data[1], 12, 21) + 1) |
228 | * (extractBit(data[1], 0, 11) + 1) |
229 | * (data[2] + 1); |
230 | if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; |
231 | assert(smt_width != 0); |
232 | // FIXME: check and fix number of cores sharing L3 cache for different configurations |
233 | // (HT-, 2 sockets), (HT-, 1 socket), (HT+, 2 sockets), (HT+, 1 socket) |
234 | coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); |
235 | dataCacheLevels_++; |
236 | } |
237 | } |
238 | } |
239 | |
240 | public: |
241 | int model; |
242 | int family; |
243 | int stepping; |
244 | int extModel; |
245 | int extFamily; |
246 | int displayFamily; // family + extFamily |
247 | int displayModel; // model + extModel |
248 | |
249 | unsigned int getNumCores(IntelCpuTopologyLevel level) { |
250 | if (level != SmtLevel && level != CoreLevel) throw Error(ERR_BAD_PARAMETER); |
251 | if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); |
252 | return (level == CoreLevel) |
253 | ? numCores_[level - 1] / numCores_[SmtLevel - 1] |
254 | : numCores_[level - 1]; |
255 | } |
256 | |
257 | unsigned int getDataCacheLevels() const { return dataCacheLevels_; } |
258 | unsigned int getCoresSharingDataCache(unsigned int i) const |
259 | { |
260 | if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); |
261 | return coresSharignDataCache_[i]; |
262 | } |
263 | unsigned int getDataCacheSize(unsigned int i) const |
264 | { |
265 | if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); |
266 | return dataCacheSize_[i]; |
267 | } |
268 | |
269 | /* |
270 | data[] = { eax, ebx, ecx, edx } |
271 | */ |
272 | static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) |
273 | { |
274 | #ifdef XBYAK_INTEL_CPU_SPECIFIC |
275 | #ifdef _MSC_VER |
276 | __cpuid(reinterpret_cast<int*>(data), eaxIn); |
277 | #else |
278 | __cpuid(eaxIn, data[0], data[1], data[2], data[3]); |
279 | #endif |
280 | #else |
281 | (void)eaxIn; |
282 | (void)data; |
283 | #endif |
284 | } |
285 | static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) |
286 | { |
287 | #ifdef XBYAK_INTEL_CPU_SPECIFIC |
288 | #ifdef _MSC_VER |
289 | __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); |
290 | #else |
291 | __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); |
292 | #endif |
293 | #else |
294 | (void)eaxIn; |
295 | (void)ecxIn; |
296 | (void)data; |
297 | #endif |
298 | } |
299 | static inline uint64 getXfeature() |
300 | { |
301 | #ifdef XBYAK_INTEL_CPU_SPECIFIC |
302 | #ifdef _MSC_VER |
303 | return _xgetbv(0); |
304 | #else |
305 | unsigned int eax, edx; |
306 | // xgetvb is not support on gcc 4.2 |
307 | // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); |
308 | __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a" (eax), "=d" (edx) : "c" (0)); |
309 | return ((uint64)edx << 32) | eax; |
310 | #endif |
311 | #else |
312 | return 0; |
313 | #endif |
314 | } |
315 | typedef uint64 Type; |
316 | |
317 | static const Type NONE = 0; |
318 | static const Type tMMX = 1 << 0; |
319 | static const Type tMMX2 = 1 << 1; |
320 | static const Type tCMOV = 1 << 2; |
321 | static const Type tSSE = 1 << 3; |
322 | static const Type tSSE2 = 1 << 4; |
323 | static const Type tSSE3 = 1 << 5; |
324 | static const Type tSSSE3 = 1 << 6; |
325 | static const Type tSSE41 = 1 << 7; |
326 | static const Type tSSE42 = 1 << 8; |
327 | static const Type tPOPCNT = 1 << 9; |
328 | static const Type tAESNI = 1 << 10; |
329 | static const Type tSSE5 = 1 << 11; |
330 | static const Type tOSXSAVE = 1 << 12; |
331 | static const Type tPCLMULQDQ = 1 << 13; |
332 | static const Type tAVX = 1 << 14; |
333 | static const Type tFMA = 1 << 15; |
334 | |
335 | static const Type t3DN = 1 << 16; |
336 | static const Type tE3DN = 1 << 17; |
337 | static const Type tSSE4a = 1 << 18; |
338 | static const Type tRDTSCP = 1 << 19; |
339 | static const Type tAVX2 = 1 << 20; |
340 | static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt |
341 | static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx |
342 | static const Type tLZCNT = 1 << 23; |
343 | |
344 | static const Type tINTEL = 1 << 24; |
345 | static const Type tAMD = 1 << 25; |
346 | |
347 | static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb |
348 | static const Type tRDRAND = 1 << 27; |
349 | static const Type tADX = 1 << 28; // adcx, adox |
350 | static const Type tRDSEED = 1 << 29; // rdseed |
351 | static const Type tSMAP = 1 << 30; // stac |
352 | static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest |
353 | static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort |
354 | static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph |
355 | static const Type tMOVBE = uint64(1) << 34; // mobve |
356 | static const Type tAVX512F = uint64(1) << 35; |
357 | static const Type tAVX512DQ = uint64(1) << 36; |
358 | static const Type tAVX512_IFMA = uint64(1) << 37; |
359 | static const Type tAVX512IFMA = tAVX512_IFMA; |
360 | static const Type tAVX512PF = uint64(1) << 38; |
361 | static const Type tAVX512ER = uint64(1) << 39; |
362 | static const Type tAVX512CD = uint64(1) << 40; |
363 | static const Type tAVX512BW = uint64(1) << 41; |
364 | static const Type tAVX512VL = uint64(1) << 42; |
365 | static const Type tAVX512_VBMI = uint64(1) << 43; |
366 | static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual |
367 | static const Type tAVX512_4VNNIW = uint64(1) << 44; |
368 | static const Type tAVX512_4FMAPS = uint64(1) << 45; |
369 | static const Type tPREFETCHWT1 = uint64(1) << 46; |
370 | static const Type tPREFETCHW = uint64(1) << 47; |
371 | static const Type tSHA = uint64(1) << 48; |
372 | static const Type tMPX = uint64(1) << 49; |
373 | static const Type tAVX512_VBMI2 = uint64(1) << 50; |
374 | static const Type tGFNI = uint64(1) << 51; |
375 | static const Type tVAES = uint64(1) << 52; |
376 | static const Type tVPCLMULQDQ = uint64(1) << 53; |
377 | static const Type tAVX512_VNNI = uint64(1) << 54; |
378 | static const Type tAVX512_BITALG = uint64(1) << 55; |
379 | static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56; |
380 | |
381 | Cpu() |
382 | : type_(NONE) |
383 | , x2APIC_supported_(false) |
384 | , numCores_() |
385 | , dataCacheSize_() |
386 | , coresSharignDataCache_() |
387 | , dataCacheLevels_(0) |
388 | { |
389 | unsigned int data[4] = {}; |
390 | const unsigned int& EAX = data[0]; |
391 | const unsigned int& EBX = data[1]; |
392 | const unsigned int& ECX = data[2]; |
393 | const unsigned int& EDX = data[3]; |
394 | getCpuid(0, data); |
395 | const unsigned int maxNum = EAX; |
396 | static const char intel[] = "ntel" ; |
397 | static const char amd[] = "cAMD" ; |
398 | if (ECX == get32bitAsBE(amd)) { |
399 | type_ |= tAMD; |
400 | getCpuid(0x80000001, data); |
401 | if (EDX & (1U << 31)) type_ |= t3DN; |
402 | if (EDX & (1U << 15)) type_ |= tCMOV; |
403 | if (EDX & (1U << 30)) type_ |= tE3DN; |
404 | if (EDX & (1U << 22)) type_ |= tMMX2; |
405 | if (EDX & (1U << 27)) type_ |= tRDTSCP; |
406 | } |
407 | if (ECX == get32bitAsBE(intel)) { |
408 | type_ |= tINTEL; |
409 | getCpuid(0x80000001, data); |
410 | if (EDX & (1U << 27)) type_ |= tRDTSCP; |
411 | if (ECX & (1U << 5)) type_ |= tLZCNT; |
412 | if (ECX & (1U << 8)) type_ |= tPREFETCHW; |
413 | } |
414 | getCpuid(1, data); |
415 | if (ECX & (1U << 0)) type_ |= tSSE3; |
416 | if (ECX & (1U << 9)) type_ |= tSSSE3; |
417 | if (ECX & (1U << 19)) type_ |= tSSE41; |
418 | if (ECX & (1U << 20)) type_ |= tSSE42; |
419 | if (ECX & (1U << 22)) type_ |= tMOVBE; |
420 | if (ECX & (1U << 23)) type_ |= tPOPCNT; |
421 | if (ECX & (1U << 25)) type_ |= tAESNI; |
422 | if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; |
423 | if (ECX & (1U << 27)) type_ |= tOSXSAVE; |
424 | if (ECX & (1U << 30)) type_ |= tRDRAND; |
425 | if (ECX & (1U << 29)) type_ |= tF16C; |
426 | |
427 | if (EDX & (1U << 15)) type_ |= tCMOV; |
428 | if (EDX & (1U << 23)) type_ |= tMMX; |
429 | if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE; |
430 | if (EDX & (1U << 26)) type_ |= tSSE2; |
431 | |
432 | if (type_ & tOSXSAVE) { |
433 | // check XFEATURE_ENABLED_MASK[2:1] = '11b' |
434 | uint64 bv = getXfeature(); |
435 | if ((bv & 6) == 6) { |
436 | if (ECX & (1U << 28)) type_ |= tAVX; |
437 | if (ECX & (1U << 12)) type_ |= tFMA; |
438 | if (((bv >> 5) & 7) == 7) { |
439 | getCpuidEx(7, 0, data); |
440 | if (EBX & (1U << 16)) type_ |= tAVX512F; |
441 | if (type_ & tAVX512F) { |
442 | if (EBX & (1U << 17)) type_ |= tAVX512DQ; |
443 | if (EBX & (1U << 21)) type_ |= tAVX512_IFMA; |
444 | if (EBX & (1U << 26)) type_ |= tAVX512PF; |
445 | if (EBX & (1U << 27)) type_ |= tAVX512ER; |
446 | if (EBX & (1U << 28)) type_ |= tAVX512CD; |
447 | if (EBX & (1U << 30)) type_ |= tAVX512BW; |
448 | if (EBX & (1U << 31)) type_ |= tAVX512VL; |
449 | if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; |
450 | if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; |
451 | if (ECX & (1U << 8)) type_ |= tGFNI; |
452 | if (ECX & (1U << 9)) type_ |= tVAES; |
453 | if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; |
454 | if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; |
455 | if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; |
456 | if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; |
457 | if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; |
458 | if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; |
459 | } |
460 | } |
461 | } |
462 | } |
463 | if (maxNum >= 7) { |
464 | getCpuidEx(7, 0, data); |
465 | if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; |
466 | if (EBX & (1U << 3)) type_ |= tBMI1; |
467 | if (EBX & (1U << 8)) type_ |= tBMI2; |
468 | if (EBX & (1U << 9)) type_ |= tENHANCED_REP; |
469 | if (EBX & (1U << 18)) type_ |= tRDSEED; |
470 | if (EBX & (1U << 19)) type_ |= tADX; |
471 | if (EBX & (1U << 20)) type_ |= tSMAP; |
472 | if (EBX & (1U << 4)) type_ |= tHLE; |
473 | if (EBX & (1U << 11)) type_ |= tRTM; |
474 | if (EBX & (1U << 14)) type_ |= tMPX; |
475 | if (EBX & (1U << 29)) type_ |= tSHA; |
476 | if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; |
477 | } |
478 | setFamily(); |
479 | setNumCores(); |
480 | setCacheHierarchy(); |
481 | } |
482 | void putFamily() const |
483 | { |
484 | printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n" , |
485 | family, model, stepping, extFamily, extModel); |
486 | printf("display:family=%X, model=%X\n" , displayFamily, displayModel); |
487 | } |
488 | bool has(Type type) const |
489 | { |
490 | return (type & type_) != 0; |
491 | } |
492 | }; |
493 | |
494 | class Clock { |
495 | public: |
496 | static inline uint64 getRdtsc() |
497 | { |
498 | #ifdef XBYAK_INTEL_CPU_SPECIFIC |
499 | #ifdef _MSC_VER |
500 | return __rdtsc(); |
501 | #else |
502 | unsigned int eax, edx; |
503 | __asm__ volatile("rdtsc" : "=a" (eax), "=d" (edx)); |
504 | return ((uint64)edx << 32) | eax; |
505 | #endif |
506 | #else |
507 | // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu |
508 | return 0; |
509 | #endif |
510 | } |
511 | Clock() |
512 | : clock_(0) |
513 | , count_(0) |
514 | { |
515 | } |
516 | void begin() |
517 | { |
518 | clock_ -= getRdtsc(); |
519 | } |
520 | void end() |
521 | { |
522 | clock_ += getRdtsc(); |
523 | count_++; |
524 | } |
525 | int getCount() const { return count_; } |
526 | uint64 getClock() const { return clock_; } |
527 | void clear() { count_ = 0; clock_ = 0; } |
528 | private: |
529 | uint64 clock_; |
530 | int count_; |
531 | }; |
532 | |
533 | #ifdef XBYAK64 |
// Flags OR-ed into StackFrame's tNum argument to request rcx / rdx as
// additional usable registers.
const int UseRCX = 0x40; // bit 6
const int UseRDX = 0x80; // bit 7
536 | |
537 | class Pack { |
538 | static const size_t maxTblNum = 15; |
539 | const Xbyak::Reg64 *tbl_[maxTblNum]; |
540 | size_t n_; |
541 | public: |
542 | Pack() : tbl_(), n_(0) {} |
543 | Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); } |
544 | Pack(const Pack& rhs) |
545 | : n_(rhs.n_) |
546 | { |
547 | for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; |
548 | } |
549 | Pack& operator=(const Pack& rhs) |
550 | { |
551 | n_ = rhs.n_; |
552 | for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; |
553 | return *this; |
554 | } |
555 | Pack(const Xbyak::Reg64& t0) |
556 | { n_ = 1; tbl_[0] = &t0; } |
557 | Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
558 | { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; } |
559 | Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
560 | { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; } |
561 | Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
562 | { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; } |
563 | Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
564 | { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; } |
565 | Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
566 | { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; } |
567 | Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
568 | { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; } |
569 | Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
570 | { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; } |
571 | Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
572 | { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; } |
573 | Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) |
574 | { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; } |
575 | Pack& append(const Xbyak::Reg64& t) |
576 | { |
577 | if (n_ == maxTblNum) { |
578 | fprintf(stderr, "ERR Pack::can't append\n" ); |
579 | throw Error(ERR_BAD_PARAMETER); |
580 | } |
581 | tbl_[n_++] = &t; |
582 | return *this; |
583 | } |
584 | void init(const Xbyak::Reg64 *tbl, size_t n) |
585 | { |
586 | if (n > maxTblNum) { |
587 | fprintf(stderr, "ERR Pack::init bad n=%d\n" , (int)n); |
588 | throw Error(ERR_BAD_PARAMETER); |
589 | } |
590 | n_ = n; |
591 | for (size_t i = 0; i < n; i++) { |
592 | tbl_[i] = &tbl[i]; |
593 | } |
594 | } |
595 | const Xbyak::Reg64& operator[](size_t n) const |
596 | { |
597 | if (n >= n_) { |
598 | fprintf(stderr, "ERR Pack bad n=%d(%d)\n" , (int)n, (int)n_); |
599 | throw Error(ERR_BAD_PARAMETER); |
600 | } |
601 | return *tbl_[n]; |
602 | } |
603 | size_t size() const { return n_; } |
604 | /* |
605 | get tbl[pos, pos + num) |
606 | */ |
607 | Pack sub(size_t pos, size_t num = size_t(-1)) const |
608 | { |
609 | if (num == size_t(-1)) num = n_ - pos; |
610 | if (pos + num > n_) { |
611 | fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n" , (int)pos, (int)num); |
612 | throw Error(ERR_BAD_PARAMETER); |
613 | } |
614 | Pack pack; |
615 | pack.n_ = num; |
616 | for (size_t i = 0; i < num; i++) { |
617 | pack.tbl_[i] = tbl_[pos + i]; |
618 | } |
619 | return pack; |
620 | } |
621 | void put() const |
622 | { |
623 | for (size_t i = 0; i < n_; i++) { |
624 | printf("%s " , tbl_[i]->toString()); |
625 | } |
626 | printf("\n" ); |
627 | } |
628 | }; |
629 | |
630 | class StackFrame { |
631 | #ifdef XBYAK64_WIN |
632 | static const int noSaveNum = 6; |
633 | static const int rcxPos = 0; |
634 | static const int rdxPos = 1; |
635 | #else |
636 | static const int noSaveNum = 8; |
637 | static const int rcxPos = 3; |
638 | static const int rdxPos = 2; |
639 | #endif |
640 | static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax |
641 | Xbyak::CodeGenerator *code_; |
642 | int pNum_; |
643 | int tNum_; |
644 | bool useRcx_; |
645 | bool useRdx_; |
646 | int saveNum_; |
647 | int P_; |
648 | bool makeEpilog_; |
649 | Xbyak::Reg64 pTbl_[4]; |
650 | Xbyak::Reg64 tTbl_[maxRegNum]; |
651 | Pack p_; |
652 | Pack t_; |
653 | StackFrame(const StackFrame&); |
654 | void operator=(const StackFrame&); |
655 | public: |
656 | const Pack& p; |
657 | const Pack& t; |
658 | /* |
659 | make stack frame |
660 | @param sf [in] this |
661 | @param pNum [in] num of function parameter(0 <= pNum <= 4) |
662 | @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14 |
663 | @param stackSizeByte [in] local stack size |
664 | @param makeEpilog [in] automatically call close() if true |
665 | |
666 | you can use |
667 | rax |
668 | gp0, ..., gp(pNum - 1) |
669 | gt0, ..., gt(tNum-1) |
670 | rcx if tNum & UseRCX |
671 | rdx if tNum & UseRDX |
672 | rsp[0..stackSizeByte - 1] |
673 | */ |
674 | StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true) |
675 | : code_(code) |
676 | , pNum_(pNum) |
677 | , tNum_(tNum & ~(UseRCX | UseRDX)) |
678 | , useRcx_((tNum & UseRCX) != 0) |
679 | , useRdx_((tNum & UseRDX) != 0) |
680 | , saveNum_(0) |
681 | , P_(0) |
682 | , makeEpilog_(makeEpilog) |
683 | , p(p_) |
684 | , t(t_) |
685 | { |
686 | using namespace Xbyak; |
687 | if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM); |
688 | const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); |
689 | if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM); |
690 | const Reg64& _rsp = code->rsp; |
691 | saveNum_ = (std::max)(0, allRegNum - noSaveNum); |
692 | const int *tbl = getOrderTbl() + noSaveNum; |
693 | for (int i = 0; i < saveNum_; i++) { |
694 | code->push(Reg64(tbl[i])); |
695 | } |
696 | P_ = (stackSizeByte + 7) / 8; |
697 | if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment |
698 | P_ *= 8; |
699 | if (P_ > 0) code->sub(_rsp, P_); |
700 | int pos = 0; |
701 | for (int i = 0; i < pNum; i++) { |
702 | pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); |
703 | } |
704 | for (int i = 0; i < tNum_; i++) { |
705 | tTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); |
706 | } |
707 | if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx); |
708 | if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx); |
709 | p_.init(pTbl_, pNum); |
710 | t_.init(tTbl_, tNum_); |
711 | } |
712 | /* |
713 | make epilog manually |
714 | @param callRet [in] call ret() if true |
715 | */ |
716 | void close(bool callRet = true) |
717 | { |
718 | using namespace Xbyak; |
719 | const Reg64& _rsp = code_->rsp; |
720 | const int *tbl = getOrderTbl() + noSaveNum; |
721 | if (P_ > 0) code_->add(_rsp, P_); |
722 | for (int i = 0; i < saveNum_; i++) { |
723 | code_->pop(Reg64(tbl[saveNum_ - 1 - i])); |
724 | } |
725 | |
726 | if (callRet) code_->ret(); |
727 | } |
728 | ~StackFrame() |
729 | { |
730 | if (!makeEpilog_) return; |
731 | try { |
732 | close(); |
733 | } catch (std::exception& e) { |
734 | printf("ERR:StackFrame %s\n" , e.what()); |
735 | //exit(1); |
736 | } |
737 | } |
738 | private: |
739 | const int *getOrderTbl() const |
740 | { |
741 | using namespace Xbyak; |
742 | static const int tbl[] = { |
743 | #ifdef XBYAK64_WIN |
744 | Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI, |
745 | #else |
746 | Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, |
747 | #endif |
748 | Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15 |
749 | }; |
750 | return &tbl[0]; |
751 | } |
752 | int getRegIdx(int& pos) const |
753 | { |
754 | assert(pos < maxRegNum); |
755 | using namespace Xbyak; |
756 | const int *tbl = getOrderTbl(); |
757 | int r = tbl[pos++]; |
758 | if (useRcx_) { |
759 | if (r == Operand::RCX) { return Operand::R10; } |
760 | if (r == Operand::R10) { r = tbl[pos++]; } |
761 | } |
762 | if (useRdx_) { |
763 | if (r == Operand::RDX) { return Operand::R11; } |
764 | if (r == Operand::R11) { return tbl[pos++]; } |
765 | } |
766 | return r; |
767 | } |
768 | }; |
769 | #endif |
770 | |
771 | } } // end of util |
772 | #endif |
773 | |