1/*******************************************************************************
2* Copyright 2016-2019 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/*******************************************************************************
18* Copyright (c) 2007 MITSUNARI Shigeo
19* All rights reserved.
20*
21* Redistribution and use in source and binary forms, with or without
22* modification, are permitted provided that the following conditions are met:
23*
24* Redistributions of source code must retain the above copyright notice, this
25* list of conditions and the following disclaimer.
26* Redistributions in binary form must reproduce the above copyright notice,
27* this list of conditions and the following disclaimer in the documentation
28* and/or other materials provided with the distribution.
29* Neither the name of the copyright owner nor the names of its contributors may
30* be used to endorse or promote products derived from this software without
31* specific prior written permission.
32*
33* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
34* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
37* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
38* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
39* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
41* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
42* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
43* THE POSSIBILITY OF SUCH DAMAGE.
44*******************************************************************************/
45
46#ifndef XBYAK_XBYAK_UTIL_H_
47#define XBYAK_XBYAK_UTIL_H_
48
49/**
50 utility class and functions for Xbyak
51 Xbyak::util::Clock ; rdtsc timer
52 Xbyak::util::Cpu ; detect CPU
53 @note this header is UNDER CONSTRUCTION!
54*/
55#include "xbyak.h"
56
// Enable the x86-specific code paths (cpuid / xgetbv / rdtsc) only when the
// compiler targets x86 or x86-64.
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
	#define XBYAK_INTEL_CPU_SPECIFIC
#endif

// Provide __cpuid (and, for non-MSVC compilers, __cpuid_count):
//   MSVC      : __cpuid(int data[4], int eaxIn)
//   otherwise : __cpuid(eaxIn, a, b, c, d) / __cpuid_count(eaxIn, ecxIn, a, b, c, d)
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
	#if (_MSC_VER < 1400) && defined(XBYAK32)
	// Hand-written replacement used when <intrin.h> is not included.
	// The function is naked, so the arguments are read directly from the
	// stack; the "4 * 2" term skips the two registers pushed below.
	static inline __declspec(naked) void __cpuid(int[4], int)
	{
		__asm {
			push	ebx
			push	esi
			mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
			cpuid
			mov		esi, dword ptr [esp + 4 * 2 + 4] // data
			mov		dword ptr [esi], eax
			mov		dword ptr [esi + 4], ebx
			mov		dword ptr [esi + 8], ecx
			mov		dword ptr [esi + 12], edx
			pop		esi
			pop		ebx
			ret
		}
	}
	#else
		#include <intrin.h> // for __cpuid
	#endif
#else
	#ifndef __GNUC_PREREQ
		#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
		#include <cpuid.h>
	#else
		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
			// ebx cannot be used as an asm operand here, so it is saved and
			// restored by hand and cpuid's ebx result is returned through esi.
			// (The value copied into esi must be ebx, not ebp.)
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#else
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#endif
	#endif
#endif
#endif
101
102namespace Xbyak { namespace util {
103
/**
	Topology level selector for Cpu::getNumCores().
	The numeric values are also used (minus one) as indices into
	the per-level core count table kept by Cpu.
*/
enum IntelCpuTopologyLevel {
	SmtLevel = 1,
	CoreLevel = 2
};
108
109/**
110 CPU detection class
111*/
112class Cpu {
113 uint64 type_;
114 //system topology
115 bool x2APIC_supported_;
116 static const size_t maxTopologyLevels = 2;
117 unsigned int numCores_[maxTopologyLevels];
118
119 static const unsigned int maxNumberCacheLevels = 10;
120 unsigned int dataCacheSize_[maxNumberCacheLevels];
121 unsigned int coresSharignDataCache_[maxNumberCacheLevels];
122 unsigned int dataCacheLevels_;
123
124 unsigned int get32bitAsBE(const char *x) const
125 {
126 return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
127 }
128 unsigned int mask(int n) const
129 {
130 return (1U << n) - 1;
131 }
132 void setFamily()
133 {
134 unsigned int data[4] = {};
135 getCpuid(1, data);
136 stepping = data[0] & mask(4);
137 model = (data[0] >> 4) & mask(4);
138 family = (data[0] >> 8) & mask(4);
139 // type = (data[0] >> 12) & mask(2);
140 extModel = (data[0] >> 16) & mask(4);
141 extFamily = (data[0] >> 20) & mask(8);
142 if (family == 0x0f) {
143 displayFamily = family + extFamily;
144 } else {
145 displayFamily = family;
146 }
147 if (family == 6 || family == 0x0f) {
148 displayModel = (extModel << 4) + model;
149 } else {
150 displayModel = model;
151 }
152 }
153 unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
154 {
155 return (val >> base) & ((1u << (end - base)) - 1);
156 }
157 void setNumCores()
158 {
159 if ((type_ & tINTEL) == 0) return;
160
161 unsigned int data[4] = {};
162
163 /* CAUTION: These numbers are configuration as shipped by Intel. */
164 getCpuidEx(0x0, 0, data);
165 if (data[0] >= 0xB) {
166 /*
167 if leaf 11 exists(x2APIC is supported),
168 we use it to get the number of smt cores and cores on socket
169
170 leaf 0xB can be zeroed-out by a hypervisor
171 */
172 x2APIC_supported_ = true;
173 for (unsigned int i = 0; i < maxTopologyLevels; i++) {
174 getCpuidEx(0xB, i, data);
175 IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
176 if (level == SmtLevel || level == CoreLevel) {
177 numCores_[level - 1] = extractBit(data[1], 0, 15);
178 }
179 }
180 } else {
181 /*
182 Failed to deremine num of cores without x2APIC support.
183 TODO: USE initial APIC ID to determine ncores.
184 */
185 numCores_[SmtLevel - 1] = 0;
186 numCores_[CoreLevel - 1] = 0;
187 }
188
189 }
190 void setCacheHierarchy()
191 {
192 if ((type_ & tINTEL) == 0) return;
193 const unsigned int NO_CACHE = 0;
194 const unsigned int DATA_CACHE = 1;
195// const unsigned int INSTRUCTION_CACHE = 2;
196 const unsigned int UNIFIED_CACHE = 3;
197 unsigned int smt_width = 0;
198 unsigned int logical_cores = 0;
199 unsigned int data[4] = {};
200
201 if (x2APIC_supported_) {
202 smt_width = numCores_[0];
203 logical_cores = numCores_[1];
204 }
205
206 /*
207 Assumptions:
208 the first level of data cache is not shared (which is the
209 case for every existing architecture) and use this to
210 determine the SMT width for arch not supporting leaf 11.
211 when leaf 4 reports a number of core less than numCores_
212 on socket reported by leaf 11, then it is a correct number
213 of cores not an upperbound.
214 */
215 for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
216 getCpuidEx(0x4, i, data);
217 unsigned int cacheType = extractBit(data[0], 0, 4);
218 if (cacheType == NO_CACHE) break;
219 if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
220 unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
221 if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
222 actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
223 }
224 assert(actual_logical_cores != 0);
225 dataCacheSize_[dataCacheLevels_] =
226 (extractBit(data[1], 22, 31) + 1)
227 * (extractBit(data[1], 12, 21) + 1)
228 * (extractBit(data[1], 0, 11) + 1)
229 * (data[2] + 1);
230 if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
231 assert(smt_width != 0);
232 // FIXME: check and fix number of cores sharing L3 cache for different configurations
233 // (HT-, 2 sockets), (HT-, 1 socket), (HT+, 2 sockets), (HT+, 1 socket)
234 coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
235 dataCacheLevels_++;
236 }
237 }
238 }
239
240public:
241 int model;
242 int family;
243 int stepping;
244 int extModel;
245 int extFamily;
246 int displayFamily; // family + extFamily
247 int displayModel; // model + extModel
248
249 unsigned int getNumCores(IntelCpuTopologyLevel level) {
250 if (level != SmtLevel && level != CoreLevel) throw Error(ERR_BAD_PARAMETER);
251 if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
252 return (level == CoreLevel)
253 ? numCores_[level - 1] / numCores_[SmtLevel - 1]
254 : numCores_[level - 1];
255 }
256
257 unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
258 unsigned int getCoresSharingDataCache(unsigned int i) const
259 {
260 if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
261 return coresSharignDataCache_[i];
262 }
263 unsigned int getDataCacheSize(unsigned int i) const
264 {
265 if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
266 return dataCacheSize_[i];
267 }
268
269 /*
270 data[] = { eax, ebx, ecx, edx }
271 */
272 static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
273 {
274#ifdef XBYAK_INTEL_CPU_SPECIFIC
275 #ifdef _MSC_VER
276 __cpuid(reinterpret_cast<int*>(data), eaxIn);
277 #else
278 __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
279 #endif
280#else
281 (void)eaxIn;
282 (void)data;
283#endif
284 }
285 static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
286 {
287#ifdef XBYAK_INTEL_CPU_SPECIFIC
288 #ifdef _MSC_VER
289 __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
290 #else
291 __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
292 #endif
293#else
294 (void)eaxIn;
295 (void)ecxIn;
296 (void)data;
297#endif
298 }
299 static inline uint64 getXfeature()
300 {
301#ifdef XBYAK_INTEL_CPU_SPECIFIC
302 #ifdef _MSC_VER
303 return _xgetbv(0);
304 #else
305 unsigned int eax, edx;
306 // xgetvb is not support on gcc 4.2
307// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
308 __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
309 return ((uint64)edx << 32) | eax;
310 #endif
311#else
312 return 0;
313#endif
314 }
315 typedef uint64 Type;
316
317 static const Type NONE = 0;
318 static const Type tMMX = 1 << 0;
319 static const Type tMMX2 = 1 << 1;
320 static const Type tCMOV = 1 << 2;
321 static const Type tSSE = 1 << 3;
322 static const Type tSSE2 = 1 << 4;
323 static const Type tSSE3 = 1 << 5;
324 static const Type tSSSE3 = 1 << 6;
325 static const Type tSSE41 = 1 << 7;
326 static const Type tSSE42 = 1 << 8;
327 static const Type tPOPCNT = 1 << 9;
328 static const Type tAESNI = 1 << 10;
329 static const Type tSSE5 = 1 << 11;
330 static const Type tOSXSAVE = 1 << 12;
331 static const Type tPCLMULQDQ = 1 << 13;
332 static const Type tAVX = 1 << 14;
333 static const Type tFMA = 1 << 15;
334
335 static const Type t3DN = 1 << 16;
336 static const Type tE3DN = 1 << 17;
337 static const Type tSSE4a = 1 << 18;
338 static const Type tRDTSCP = 1 << 19;
339 static const Type tAVX2 = 1 << 20;
340 static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
341 static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
342 static const Type tLZCNT = 1 << 23;
343
344 static const Type tINTEL = 1 << 24;
345 static const Type tAMD = 1 << 25;
346
347 static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
348 static const Type tRDRAND = 1 << 27;
349 static const Type tADX = 1 << 28; // adcx, adox
350 static const Type tRDSEED = 1 << 29; // rdseed
351 static const Type tSMAP = 1 << 30; // stac
352 static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
353 static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
354 static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
355 static const Type tMOVBE = uint64(1) << 34; // mobve
356 static const Type tAVX512F = uint64(1) << 35;
357 static const Type tAVX512DQ = uint64(1) << 36;
358 static const Type tAVX512_IFMA = uint64(1) << 37;
359 static const Type tAVX512IFMA = tAVX512_IFMA;
360 static const Type tAVX512PF = uint64(1) << 38;
361 static const Type tAVX512ER = uint64(1) << 39;
362 static const Type tAVX512CD = uint64(1) << 40;
363 static const Type tAVX512BW = uint64(1) << 41;
364 static const Type tAVX512VL = uint64(1) << 42;
365 static const Type tAVX512_VBMI = uint64(1) << 43;
366 static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
367 static const Type tAVX512_4VNNIW = uint64(1) << 44;
368 static const Type tAVX512_4FMAPS = uint64(1) << 45;
369 static const Type tPREFETCHWT1 = uint64(1) << 46;
370 static const Type tPREFETCHW = uint64(1) << 47;
371 static const Type tSHA = uint64(1) << 48;
372 static const Type tMPX = uint64(1) << 49;
373 static const Type tAVX512_VBMI2 = uint64(1) << 50;
374 static const Type tGFNI = uint64(1) << 51;
375 static const Type tVAES = uint64(1) << 52;
376 static const Type tVPCLMULQDQ = uint64(1) << 53;
377 static const Type tAVX512_VNNI = uint64(1) << 54;
378 static const Type tAVX512_BITALG = uint64(1) << 55;
379 static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
380
381 Cpu()
382 : type_(NONE)
383 , x2APIC_supported_(false)
384 , numCores_()
385 , dataCacheSize_()
386 , coresSharignDataCache_()
387 , dataCacheLevels_(0)
388 {
389 unsigned int data[4] = {};
390 const unsigned int& EAX = data[0];
391 const unsigned int& EBX = data[1];
392 const unsigned int& ECX = data[2];
393 const unsigned int& EDX = data[3];
394 getCpuid(0, data);
395 const unsigned int maxNum = EAX;
396 static const char intel[] = "ntel";
397 static const char amd[] = "cAMD";
398 if (ECX == get32bitAsBE(amd)) {
399 type_ |= tAMD;
400 getCpuid(0x80000001, data);
401 if (EDX & (1U << 31)) type_ |= t3DN;
402 if (EDX & (1U << 15)) type_ |= tCMOV;
403 if (EDX & (1U << 30)) type_ |= tE3DN;
404 if (EDX & (1U << 22)) type_ |= tMMX2;
405 if (EDX & (1U << 27)) type_ |= tRDTSCP;
406 }
407 if (ECX == get32bitAsBE(intel)) {
408 type_ |= tINTEL;
409 getCpuid(0x80000001, data);
410 if (EDX & (1U << 27)) type_ |= tRDTSCP;
411 if (ECX & (1U << 5)) type_ |= tLZCNT;
412 if (ECX & (1U << 8)) type_ |= tPREFETCHW;
413 }
414 getCpuid(1, data);
415 if (ECX & (1U << 0)) type_ |= tSSE3;
416 if (ECX & (1U << 9)) type_ |= tSSSE3;
417 if (ECX & (1U << 19)) type_ |= tSSE41;
418 if (ECX & (1U << 20)) type_ |= tSSE42;
419 if (ECX & (1U << 22)) type_ |= tMOVBE;
420 if (ECX & (1U << 23)) type_ |= tPOPCNT;
421 if (ECX & (1U << 25)) type_ |= tAESNI;
422 if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
423 if (ECX & (1U << 27)) type_ |= tOSXSAVE;
424 if (ECX & (1U << 30)) type_ |= tRDRAND;
425 if (ECX & (1U << 29)) type_ |= tF16C;
426
427 if (EDX & (1U << 15)) type_ |= tCMOV;
428 if (EDX & (1U << 23)) type_ |= tMMX;
429 if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
430 if (EDX & (1U << 26)) type_ |= tSSE2;
431
432 if (type_ & tOSXSAVE) {
433 // check XFEATURE_ENABLED_MASK[2:1] = '11b'
434 uint64 bv = getXfeature();
435 if ((bv & 6) == 6) {
436 if (ECX & (1U << 28)) type_ |= tAVX;
437 if (ECX & (1U << 12)) type_ |= tFMA;
438 if (((bv >> 5) & 7) == 7) {
439 getCpuidEx(7, 0, data);
440 if (EBX & (1U << 16)) type_ |= tAVX512F;
441 if (type_ & tAVX512F) {
442 if (EBX & (1U << 17)) type_ |= tAVX512DQ;
443 if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
444 if (EBX & (1U << 26)) type_ |= tAVX512PF;
445 if (EBX & (1U << 27)) type_ |= tAVX512ER;
446 if (EBX & (1U << 28)) type_ |= tAVX512CD;
447 if (EBX & (1U << 30)) type_ |= tAVX512BW;
448 if (EBX & (1U << 31)) type_ |= tAVX512VL;
449 if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
450 if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
451 if (ECX & (1U << 8)) type_ |= tGFNI;
452 if (ECX & (1U << 9)) type_ |= tVAES;
453 if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
454 if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
455 if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
456 if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
457 if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
458 if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
459 }
460 }
461 }
462 }
463 if (maxNum >= 7) {
464 getCpuidEx(7, 0, data);
465 if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
466 if (EBX & (1U << 3)) type_ |= tBMI1;
467 if (EBX & (1U << 8)) type_ |= tBMI2;
468 if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
469 if (EBX & (1U << 18)) type_ |= tRDSEED;
470 if (EBX & (1U << 19)) type_ |= tADX;
471 if (EBX & (1U << 20)) type_ |= tSMAP;
472 if (EBX & (1U << 4)) type_ |= tHLE;
473 if (EBX & (1U << 11)) type_ |= tRTM;
474 if (EBX & (1U << 14)) type_ |= tMPX;
475 if (EBX & (1U << 29)) type_ |= tSHA;
476 if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
477 }
478 setFamily();
479 setNumCores();
480 setCacheHierarchy();
481 }
482 void putFamily() const
483 {
484 printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
485 family, model, stepping, extFamily, extModel);
486 printf("display:family=%X, model=%X\n", displayFamily, displayModel);
487 }
488 bool has(Type type) const
489 {
490 return (type & type_) != 0;
491 }
492};
493
494class Clock {
495public:
496 static inline uint64 getRdtsc()
497 {
498#ifdef XBYAK_INTEL_CPU_SPECIFIC
499 #ifdef _MSC_VER
500 return __rdtsc();
501 #else
502 unsigned int eax, edx;
503 __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
504 return ((uint64)edx << 32) | eax;
505 #endif
506#else
507 // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
508 return 0;
509#endif
510 }
511 Clock()
512 : clock_(0)
513 , count_(0)
514 {
515 }
516 void begin()
517 {
518 clock_ -= getRdtsc();
519 }
520 void end()
521 {
522 clock_ += getRdtsc();
523 count_++;
524 }
525 int getCount() const { return count_; }
526 uint64 getClock() const { return clock_; }
527 void clear() { count_ = 0; clock_ = 0; }
528private:
529 uint64 clock_;
530 int count_;
531};
532
533#ifdef XBYAK64
// Flags OR-ed into the tNum argument of StackFrame to request rcx / rdx
// as additional scratch registers.
const int UseRCX = 0x40; // bit 6
const int UseRDX = 0x80; // bit 7
536
537class Pack {
538 static const size_t maxTblNum = 15;
539 const Xbyak::Reg64 *tbl_[maxTblNum];
540 size_t n_;
541public:
542 Pack() : tbl_(), n_(0) {}
543 Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
544 Pack(const Pack& rhs)
545 : n_(rhs.n_)
546 {
547 for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
548 }
549 Pack& operator=(const Pack& rhs)
550 {
551 n_ = rhs.n_;
552 for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
553 return *this;
554 }
555 Pack(const Xbyak::Reg64& t0)
556 { n_ = 1; tbl_[0] = &t0; }
557 Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
558 { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
559 Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
560 { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
561 Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
562 { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
563 Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
564 { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
565 Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
566 { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
567 Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
568 { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
569 Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
570 { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
571 Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
572 { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
573 Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
574 { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
575 Pack& append(const Xbyak::Reg64& t)
576 {
577 if (n_ == maxTblNum) {
578 fprintf(stderr, "ERR Pack::can't append\n");
579 throw Error(ERR_BAD_PARAMETER);
580 }
581 tbl_[n_++] = &t;
582 return *this;
583 }
584 void init(const Xbyak::Reg64 *tbl, size_t n)
585 {
586 if (n > maxTblNum) {
587 fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
588 throw Error(ERR_BAD_PARAMETER);
589 }
590 n_ = n;
591 for (size_t i = 0; i < n; i++) {
592 tbl_[i] = &tbl[i];
593 }
594 }
595 const Xbyak::Reg64& operator[](size_t n) const
596 {
597 if (n >= n_) {
598 fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
599 throw Error(ERR_BAD_PARAMETER);
600 }
601 return *tbl_[n];
602 }
603 size_t size() const { return n_; }
604 /*
605 get tbl[pos, pos + num)
606 */
607 Pack sub(size_t pos, size_t num = size_t(-1)) const
608 {
609 if (num == size_t(-1)) num = n_ - pos;
610 if (pos + num > n_) {
611 fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
612 throw Error(ERR_BAD_PARAMETER);
613 }
614 Pack pack;
615 pack.n_ = num;
616 for (size_t i = 0; i < num; i++) {
617 pack.tbl_[i] = tbl_[pos + i];
618 }
619 return pack;
620 }
621 void put() const
622 {
623 for (size_t i = 0; i < n_; i++) {
624 printf("%s ", tbl_[i]->toString());
625 }
626 printf("\n");
627 }
628};
629
630class StackFrame {
631#ifdef XBYAK64_WIN
632 static const int noSaveNum = 6;
633 static const int rcxPos = 0;
634 static const int rdxPos = 1;
635#else
636 static const int noSaveNum = 8;
637 static const int rcxPos = 3;
638 static const int rdxPos = 2;
639#endif
640 static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
641 Xbyak::CodeGenerator *code_;
642 int pNum_;
643 int tNum_;
644 bool useRcx_;
645 bool useRdx_;
646 int saveNum_;
647 int P_;
648 bool makeEpilog_;
649 Xbyak::Reg64 pTbl_[4];
650 Xbyak::Reg64 tTbl_[maxRegNum];
651 Pack p_;
652 Pack t_;
653 StackFrame(const StackFrame&);
654 void operator=(const StackFrame&);
655public:
656 const Pack& p;
657 const Pack& t;
658 /*
659 make stack frame
660 @param sf [in] this
661 @param pNum [in] num of function parameter(0 <= pNum <= 4)
662 @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
663 @param stackSizeByte [in] local stack size
664 @param makeEpilog [in] automatically call close() if true
665
666 you can use
667 rax
668 gp0, ..., gp(pNum - 1)
669 gt0, ..., gt(tNum-1)
670 rcx if tNum & UseRCX
671 rdx if tNum & UseRDX
672 rsp[0..stackSizeByte - 1]
673 */
674 StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
675 : code_(code)
676 , pNum_(pNum)
677 , tNum_(tNum & ~(UseRCX | UseRDX))
678 , useRcx_((tNum & UseRCX) != 0)
679 , useRdx_((tNum & UseRDX) != 0)
680 , saveNum_(0)
681 , P_(0)
682 , makeEpilog_(makeEpilog)
683 , p(p_)
684 , t(t_)
685 {
686 using namespace Xbyak;
687 if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
688 const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
689 if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
690 const Reg64& _rsp = code->rsp;
691 saveNum_ = (std::max)(0, allRegNum - noSaveNum);
692 const int *tbl = getOrderTbl() + noSaveNum;
693 for (int i = 0; i < saveNum_; i++) {
694 code->push(Reg64(tbl[i]));
695 }
696 P_ = (stackSizeByte + 7) / 8;
697 if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
698 P_ *= 8;
699 if (P_ > 0) code->sub(_rsp, P_);
700 int pos = 0;
701 for (int i = 0; i < pNum; i++) {
702 pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
703 }
704 for (int i = 0; i < tNum_; i++) {
705 tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
706 }
707 if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
708 if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
709 p_.init(pTbl_, pNum);
710 t_.init(tTbl_, tNum_);
711 }
712 /*
713 make epilog manually
714 @param callRet [in] call ret() if true
715 */
716 void close(bool callRet = true)
717 {
718 using namespace Xbyak;
719 const Reg64& _rsp = code_->rsp;
720 const int *tbl = getOrderTbl() + noSaveNum;
721 if (P_ > 0) code_->add(_rsp, P_);
722 for (int i = 0; i < saveNum_; i++) {
723 code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
724 }
725
726 if (callRet) code_->ret();
727 }
728 ~StackFrame()
729 {
730 if (!makeEpilog_) return;
731 try {
732 close();
733 } catch (std::exception& e) {
734 printf("ERR:StackFrame %s\n", e.what());
735 //exit(1);
736 }
737 }
738private:
739 const int *getOrderTbl() const
740 {
741 using namespace Xbyak;
742 static const int tbl[] = {
743#ifdef XBYAK64_WIN
744 Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
745#else
746 Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
747#endif
748 Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
749 };
750 return &tbl[0];
751 }
752 int getRegIdx(int& pos) const
753 {
754 assert(pos < maxRegNum);
755 using namespace Xbyak;
756 const int *tbl = getOrderTbl();
757 int r = tbl[pos++];
758 if (useRcx_) {
759 if (r == Operand::RCX) { return Operand::R10; }
760 if (r == Operand::R10) { r = tbl[pos++]; }
761 }
762 if (useRdx_) {
763 if (r == Operand::RDX) { return Operand::R11; }
764 if (r == Operand::R11) { return tbl[pos++]; }
765 }
766 return r;
767 }
768};
769#endif
770
771} } // end of util
772#endif
773