| 1 | /******************************************************************************* | 
| 2 | * Copyright 2016-2019 Intel Corporation | 
| 3 | * | 
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); | 
| 5 | * you may not use this file except in compliance with the License. | 
| 6 | * You may obtain a copy of the License at | 
| 7 | * | 
| 8 | *     http://www.apache.org/licenses/LICENSE-2.0 | 
| 9 | * | 
| 10 | * Unless required by applicable law or agreed to in writing, software | 
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, | 
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 13 | * See the License for the specific language governing permissions and | 
| 14 | * limitations under the License. | 
| 15 | *******************************************************************************/ | 
| 16 |  | 
| 17 | /******************************************************************************* | 
| 18 | * Copyright (c) 2007 MITSUNARI Shigeo | 
| 19 | * All rights reserved. | 
| 20 | * | 
| 21 | * Redistribution and use in source and binary forms, with or without | 
| 22 | * modification, are permitted provided that the following conditions are met: | 
| 23 | * | 
| 24 | * Redistributions of source code must retain the above copyright notice, this | 
| 25 | * list of conditions and the following disclaimer. | 
| 26 | * Redistributions in binary form must reproduce the above copyright notice, | 
| 27 | * this list of conditions and the following disclaimer in the documentation | 
| 28 | * and/or other materials provided with the distribution. | 
| 29 | * Neither the name of the copyright owner nor the names of its contributors may | 
| 30 | * be used to endorse or promote products derived from this software without | 
| 31 | * specific prior written permission. | 
| 32 | * | 
| 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
| 34 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 35 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
| 36 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
| 37 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
| 38 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
| 39 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
| 40 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
| 41 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
| 42 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | 
| 43 | * THE POSSIBILITY OF SUCH DAMAGE. | 
| 44 | *******************************************************************************/ | 
| 45 |  | 
| 46 | #ifndef XBYAK_XBYAK_UTIL_H_ | 
| 47 | #define XBYAK_XBYAK_UTIL_H_ | 
| 48 |  | 
| 49 | /** | 
| 50 | 	utility class and functions for Xbyak | 
| 51 | 	Xbyak::util::Clock ; rdtsc timer | 
| 52 | 	Xbyak::util::Cpu ; detect CPU | 
| 53 | 	@note this header is UNDER CONSTRUCTION! | 
| 54 | */ | 
| 55 | #include "xbyak.h" | 
| 56 |  | 
/*
	XBYAK_INTEL_CPU_SPECIFIC is defined only when the target is x86 or x86-64.
	When it is defined, this section makes a cpuid primitive available:
	- MSVC            : __cpuid(int[4], int) from <intrin.h>, or a hand written
	                    naked function for 32-bit compilers older than VC8
	- gcc/clang       : __cpuid / __cpuid_count macros from <cpuid.h>
	- old gcc / Apple : __cpuid / __cpuid_count macros defined here
*/
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
	#define XBYAK_INTEL_CPU_SPECIFIC
#endif

#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
	#if (_MSC_VER < 1400) && defined(XBYAK32)
		/*
			naked function: no prologue is generated, so the arguments are
			read relative to esp after the two pushes below
			(+8 for the pushes, then +4 = data pointer, +8 = eaxIn)
		*/
		static inline __declspec(naked) void __cpuid(int[4], int)
		{
			__asm {
				push	ebx
				push	esi
				mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
				cpuid
				mov		esi, dword ptr [esp + 4 * 2 + 4] // data
				mov		dword ptr [esi], eax
				mov		dword ptr [esi + 4], ebx
				mov		dword ptr [esi + 8], ecx
				mov		dword ptr [esi + 12], edx
				pop		esi
				pop		ebx
				ret
			}
		}
	#else
		#include <intrin.h> // for __cpuid
	#endif
#else
	#ifndef __GNUC_PREREQ
	#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
		#include <cpuid.h>
	#else
		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
			// ebx can not be used as an output operand here, so it is saved on the stack,
			// the ebx result of cpuid is handed over through esi ("=S"), and ebx is restored
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#else
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#endif
	#endif
#endif
#endif
| 101 |  | 
| 102 | namespace Xbyak { namespace util { | 
| 103 |  | 
/**
	topology levels that can be queried with Cpu::getNumCores();
	the numeric value minus one is used as an array index by Cpu
*/
enum IntelCpuTopologyLevel {
	SmtLevel = 1,
	CoreLevel
};
| 108 |  | 
| 109 | /** | 
| 110 | 	CPU detection class | 
| 111 | */ | 
| 112 | class Cpu { | 
| 113 | 	uint64 type_; | 
| 114 | 	//system topology | 
| 115 | 	bool x2APIC_supported_; | 
| 116 | 	static const size_t maxTopologyLevels = 2; | 
| 117 | 	unsigned int numCores_[maxTopologyLevels]; | 
| 118 |  | 
| 119 | 	static const unsigned int maxNumberCacheLevels = 10; | 
| 120 | 	unsigned int dataCacheSize_[maxNumberCacheLevels]; | 
| 121 | 	unsigned int coresSharignDataCache_[maxNumberCacheLevels]; | 
| 122 | 	unsigned int dataCacheLevels_; | 
| 123 |  | 
| 124 | 	unsigned int get32bitAsBE(const char *x) const | 
| 125 | 	{ | 
| 126 | 		return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); | 
| 127 | 	} | 
| 128 | 	unsigned int mask(int n) const | 
| 129 | 	{ | 
| 130 | 		return (1U << n) - 1; | 
| 131 | 	} | 
| 132 | 	void setFamily() | 
| 133 | 	{ | 
| 134 | 		unsigned int data[4] = {}; | 
| 135 | 		getCpuid(1, data); | 
| 136 | 		stepping = data[0] & mask(4); | 
| 137 | 		model = (data[0] >> 4) & mask(4); | 
| 138 | 		family = (data[0] >> 8) & mask(4); | 
| 139 | 		// type = (data[0] >> 12) & mask(2); | 
| 140 | 		extModel = (data[0] >> 16) & mask(4); | 
| 141 | 		extFamily = (data[0] >> 20) & mask(8); | 
| 142 | 		if (family == 0x0f) { | 
| 143 | 			displayFamily = family + extFamily; | 
| 144 | 		} else { | 
| 145 | 			displayFamily = family; | 
| 146 | 		} | 
| 147 | 		if (family == 6 || family == 0x0f) { | 
| 148 | 			displayModel = (extModel << 4) + model; | 
| 149 | 		} else { | 
| 150 | 			displayModel = model; | 
| 151 | 		} | 
| 152 | 	} | 
| 153 | 	unsigned int (unsigned int val, unsigned int base, unsigned int end) | 
| 154 | 	{ | 
| 155 | 		return (val >> base) & ((1u << (end - base)) - 1); | 
| 156 | 	} | 
| 157 | 	void setNumCores() | 
| 158 | 	{ | 
| 159 | 		if ((type_ & tINTEL) == 0) return; | 
| 160 |  | 
| 161 | 		unsigned int data[4] = {}; | 
| 162 |  | 
| 163 | 		 /* CAUTION: These numbers are configuration as shipped by Intel. */ | 
| 164 | 		getCpuidEx(0x0, 0, data); | 
| 165 | 		if (data[0] >= 0xB) { | 
| 166 | 			 /* | 
| 167 | 				if leaf 11 exists(x2APIC is supported), | 
| 168 | 				we use it to get the number of smt cores and cores on socket | 
| 169 |  | 
| 170 | 				leaf 0xB can be zeroed-out by a hypervisor | 
| 171 | 			*/ | 
| 172 | 			x2APIC_supported_ = true; | 
| 173 | 			for (unsigned int i = 0; i < maxTopologyLevels; i++) { | 
| 174 | 				getCpuidEx(0xB, i, data); | 
| 175 | 				IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); | 
| 176 | 				if (level == SmtLevel || level == CoreLevel) { | 
| 177 | 					numCores_[level - 1] = extractBit(data[1], 0, 15); | 
| 178 | 				} | 
| 179 | 			} | 
| 180 | 		} else { | 
| 181 | 			/* | 
| 182 | 				Failed to deremine num of cores without x2APIC support. | 
| 183 | 				TODO: USE initial APIC ID to determine ncores. | 
| 184 | 			*/ | 
| 185 | 			numCores_[SmtLevel - 1] = 0; | 
| 186 | 			numCores_[CoreLevel - 1] = 0; | 
| 187 | 		} | 
| 188 |  | 
| 189 | 	} | 
| 190 | 	void setCacheHierarchy() | 
| 191 | 	{ | 
| 192 | 		if ((type_ & tINTEL) == 0) return; | 
| 193 | 		const unsigned int NO_CACHE = 0; | 
| 194 | 		const unsigned int DATA_CACHE = 1; | 
| 195 | //		const unsigned int INSTRUCTION_CACHE = 2; | 
| 196 | 		const unsigned int UNIFIED_CACHE = 3; | 
| 197 | 		unsigned int smt_width = 0; | 
| 198 | 		unsigned int logical_cores = 0; | 
| 199 | 		unsigned int data[4] = {}; | 
| 200 |  | 
| 201 | 		if (x2APIC_supported_) { | 
| 202 | 			smt_width = numCores_[0]; | 
| 203 | 			logical_cores = numCores_[1]; | 
| 204 | 		} | 
| 205 |  | 
| 206 | 		/* | 
| 207 | 			Assumptions: | 
| 208 | 			the first level of data cache is not shared (which is the | 
| 209 | 			case for every existing architecture) and use this to | 
| 210 | 			determine the SMT width for arch not supporting leaf 11. | 
| 211 | 			when leaf 4 reports a number of core less than numCores_ | 
| 212 | 			on socket reported by leaf 11, then it is a correct number | 
| 213 | 			of cores not an upperbound. | 
| 214 | 		*/ | 
| 215 | 		for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { | 
| 216 | 			getCpuidEx(0x4, i, data); | 
| 217 | 			unsigned int cacheType = extractBit(data[0], 0, 4); | 
| 218 | 			if (cacheType == NO_CACHE) break; | 
| 219 | 			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { | 
| 220 | 				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; | 
| 221 | 				if (logical_cores != 0) { // true only if leaf 0xB is supported and valid | 
| 222 | 					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); | 
| 223 | 				} | 
| 224 | 				assert(actual_logical_cores != 0); | 
| 225 | 				dataCacheSize_[dataCacheLevels_] = | 
| 226 | 					(extractBit(data[1], 22, 31) + 1) | 
| 227 | 					* (extractBit(data[1], 12, 21) + 1) | 
| 228 | 					* (extractBit(data[1], 0, 11) + 1) | 
| 229 | 					* (data[2] + 1); | 
| 230 | 				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; | 
| 231 | 				assert(smt_width != 0); | 
| 232 | 				// FIXME: check and fix number of cores sharing L3 cache for different configurations | 
| 233 | 				// (HT-, 2 sockets), (HT-, 1 socket), (HT+, 2 sockets), (HT+, 1 socket) | 
| 234 | 				coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); | 
| 235 | 				dataCacheLevels_++; | 
| 236 | 			} | 
| 237 | 		} | 
| 238 | 	} | 
| 239 |  | 
| 240 | public: | 
| 241 | 	int model; | 
| 242 | 	int family; | 
| 243 | 	int stepping; | 
| 244 | 	int extModel; | 
| 245 | 	int extFamily; | 
| 246 | 	int displayFamily; // family + extFamily | 
| 247 | 	int displayModel; // model + extModel | 
| 248 |  | 
| 249 | 	unsigned int getNumCores(IntelCpuTopologyLevel level) { | 
| 250 | 		if (level != SmtLevel && level != CoreLevel) throw Error(ERR_BAD_PARAMETER); | 
| 251 | 		if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); | 
| 252 | 		return (level == CoreLevel) | 
| 253 | 			? numCores_[level - 1] / numCores_[SmtLevel - 1] | 
| 254 | 			: numCores_[level - 1]; | 
| 255 | 	} | 
| 256 |  | 
| 257 | 	unsigned int getDataCacheLevels() const { return dataCacheLevels_; } | 
| 258 | 	unsigned int getCoresSharingDataCache(unsigned int i) const | 
| 259 | 	{ | 
| 260 | 		if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER); | 
| 261 | 		return coresSharignDataCache_[i]; | 
| 262 | 	} | 
| 263 | 	unsigned int getDataCacheSize(unsigned int i) const | 
| 264 | 	{ | 
| 265 | 		if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER); | 
| 266 | 		return dataCacheSize_[i]; | 
| 267 | 	} | 
| 268 |  | 
| 269 | 	/* | 
| 270 | 		data[] = { eax, ebx, ecx, edx } | 
| 271 | 	*/ | 
| 272 | 	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) | 
| 273 | 	{ | 
| 274 | #ifdef XBYAK_INTEL_CPU_SPECIFIC | 
| 275 | 	#ifdef _MSC_VER | 
| 276 | 		__cpuid(reinterpret_cast<int*>(data), eaxIn); | 
| 277 | 	#else | 
| 278 | 		__cpuid(eaxIn, data[0], data[1], data[2], data[3]); | 
| 279 | 	#endif | 
| 280 | #else | 
| 281 | 		(void)eaxIn; | 
| 282 | 		(void)data; | 
| 283 | #endif | 
| 284 | 	} | 
| 285 | 	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) | 
| 286 | 	{ | 
| 287 | #ifdef XBYAK_INTEL_CPU_SPECIFIC | 
| 288 | 	#ifdef _MSC_VER | 
| 289 | 		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); | 
| 290 | 	#else | 
| 291 | 		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); | 
| 292 | 	#endif | 
| 293 | #else | 
| 294 | 		(void)eaxIn; | 
| 295 | 		(void)ecxIn; | 
| 296 | 		(void)data; | 
| 297 | #endif | 
| 298 | 	} | 
| 299 | 	static inline uint64 getXfeature() | 
| 300 | 	{ | 
| 301 | #ifdef XBYAK_INTEL_CPU_SPECIFIC | 
| 302 | 	#ifdef _MSC_VER | 
| 303 | 		return _xgetbv(0); | 
| 304 | 	#else | 
| 305 | 		unsigned int eax, edx; | 
| 306 | 		// xgetvb is not support on gcc 4.2 | 
| 307 | //		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); | 
| 308 | 		__asm__ volatile(".byte 0x0f, 0x01, 0xd0"  : "=a" (eax), "=d" (edx) : "c" (0)); | 
| 309 | 		return ((uint64)edx << 32) | eax; | 
| 310 | 	#endif | 
| 311 | #else | 
| 312 | 		return 0; | 
| 313 | #endif | 
| 314 | 	} | 
| 315 | 	typedef uint64 Type; | 
| 316 |  | 
| 317 | 	static const Type NONE = 0; | 
| 318 | 	static const Type tMMX = 1 << 0; | 
| 319 | 	static const Type tMMX2 = 1 << 1; | 
| 320 | 	static const Type tCMOV = 1 << 2; | 
| 321 | 	static const Type tSSE = 1 << 3; | 
| 322 | 	static const Type tSSE2 = 1 << 4; | 
| 323 | 	static const Type tSSE3 = 1 << 5; | 
| 324 | 	static const Type tSSSE3 = 1 << 6; | 
| 325 | 	static const Type tSSE41 = 1 << 7; | 
| 326 | 	static const Type tSSE42 = 1 << 8; | 
| 327 | 	static const Type tPOPCNT = 1 << 9; | 
| 328 | 	static const Type tAESNI = 1 << 10; | 
| 329 | 	static const Type tSSE5 = 1 << 11; | 
| 330 | 	static const Type tOSXSAVE = 1 << 12; | 
| 331 | 	static const Type tPCLMULQDQ = 1 << 13; | 
| 332 | 	static const Type tAVX = 1 << 14; | 
| 333 | 	static const Type tFMA = 1 << 15; | 
| 334 |  | 
| 335 | 	static const Type t3DN = 1 << 16; | 
| 336 | 	static const Type tE3DN = 1 << 17; | 
| 337 | 	static const Type tSSE4a = 1 << 18; | 
| 338 | 	static const Type tRDTSCP = 1 << 19; | 
| 339 | 	static const Type tAVX2 = 1 << 20; | 
| 340 | 	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt | 
| 341 | 	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx | 
| 342 | 	static const Type tLZCNT = 1 << 23; | 
| 343 |  | 
| 344 | 	static const Type tINTEL = 1 << 24; | 
| 345 | 	static const Type tAMD = 1 << 25; | 
| 346 |  | 
| 347 | 	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb | 
| 348 | 	static const Type tRDRAND = 1 << 27; | 
| 349 | 	static const Type tADX = 1 << 28; // adcx, adox | 
| 350 | 	static const Type tRDSEED = 1 << 29; // rdseed | 
| 351 | 	static const Type tSMAP = 1 << 30; // stac | 
| 352 | 	static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest | 
| 353 | 	static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort | 
| 354 | 	static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph | 
| 355 | 	static const Type tMOVBE = uint64(1) << 34; // mobve | 
| 356 | 	static const Type tAVX512F = uint64(1) << 35; | 
| 357 | 	static const Type tAVX512DQ = uint64(1) << 36; | 
| 358 | 	static const Type tAVX512_IFMA = uint64(1) << 37; | 
| 359 | 	static const Type tAVX512IFMA = tAVX512_IFMA; | 
| 360 | 	static const Type tAVX512PF = uint64(1) << 38; | 
| 361 | 	static const Type tAVX512ER = uint64(1) << 39; | 
| 362 | 	static const Type tAVX512CD = uint64(1) << 40; | 
| 363 | 	static const Type tAVX512BW = uint64(1) << 41; | 
| 364 | 	static const Type tAVX512VL = uint64(1) << 42; | 
| 365 | 	static const Type tAVX512_VBMI = uint64(1) << 43; | 
| 366 | 	static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual | 
| 367 | 	static const Type tAVX512_4VNNIW = uint64(1) << 44; | 
| 368 | 	static const Type tAVX512_4FMAPS = uint64(1) << 45; | 
| 369 | 	static const Type tPREFETCHWT1 = uint64(1) << 46; | 
| 370 | 	static const Type tPREFETCHW = uint64(1) << 47; | 
| 371 | 	static const Type tSHA = uint64(1) << 48; | 
| 372 | 	static const Type tMPX = uint64(1) << 49; | 
| 373 | 	static const Type tAVX512_VBMI2 = uint64(1) << 50; | 
| 374 | 	static const Type tGFNI = uint64(1) << 51; | 
| 375 | 	static const Type tVAES = uint64(1) << 52; | 
| 376 | 	static const Type tVPCLMULQDQ = uint64(1) << 53; | 
| 377 | 	static const Type tAVX512_VNNI = uint64(1) << 54; | 
| 378 | 	static const Type tAVX512_BITALG = uint64(1) << 55; | 
| 379 | 	static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56; | 
| 380 |  | 
| 381 | 	Cpu() | 
| 382 | 		: type_(NONE) | 
| 383 | 		, x2APIC_supported_(false) | 
| 384 | 		, numCores_() | 
| 385 | 		, dataCacheSize_() | 
| 386 | 		, coresSharignDataCache_() | 
| 387 | 		, dataCacheLevels_(0) | 
| 388 | 	{ | 
| 389 | 		unsigned int data[4] = {}; | 
| 390 | 		const unsigned int& EAX = data[0]; | 
| 391 | 		const unsigned int& EBX = data[1]; | 
| 392 | 		const unsigned int& ECX = data[2]; | 
| 393 | 		const unsigned int& EDX = data[3]; | 
| 394 | 		getCpuid(0, data); | 
| 395 | 		const unsigned int maxNum = EAX; | 
| 396 | 		static const char intel[] = "ntel" ; | 
| 397 | 		static const char amd[] = "cAMD" ; | 
| 398 | 		if (ECX == get32bitAsBE(amd)) { | 
| 399 | 			type_ |= tAMD; | 
| 400 | 			getCpuid(0x80000001, data); | 
| 401 | 			if (EDX & (1U << 31)) type_ |= t3DN; | 
| 402 | 			if (EDX & (1U << 15)) type_ |= tCMOV; | 
| 403 | 			if (EDX & (1U << 30)) type_ |= tE3DN; | 
| 404 | 			if (EDX & (1U << 22)) type_ |= tMMX2; | 
| 405 | 			if (EDX & (1U << 27)) type_ |= tRDTSCP; | 
| 406 | 		} | 
| 407 | 		if (ECX == get32bitAsBE(intel)) { | 
| 408 | 			type_ |= tINTEL; | 
| 409 | 			getCpuid(0x80000001, data); | 
| 410 | 			if (EDX & (1U << 27)) type_ |= tRDTSCP; | 
| 411 | 			if (ECX & (1U << 5)) type_ |= tLZCNT; | 
| 412 | 			if (ECX & (1U << 8)) type_ |= tPREFETCHW; | 
| 413 | 		} | 
| 414 | 		getCpuid(1, data); | 
| 415 | 		if (ECX & (1U << 0)) type_ |= tSSE3; | 
| 416 | 		if (ECX & (1U << 9)) type_ |= tSSSE3; | 
| 417 | 		if (ECX & (1U << 19)) type_ |= tSSE41; | 
| 418 | 		if (ECX & (1U << 20)) type_ |= tSSE42; | 
| 419 | 		if (ECX & (1U << 22)) type_ |= tMOVBE; | 
| 420 | 		if (ECX & (1U << 23)) type_ |= tPOPCNT; | 
| 421 | 		if (ECX & (1U << 25)) type_ |= tAESNI; | 
| 422 | 		if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; | 
| 423 | 		if (ECX & (1U << 27)) type_ |= tOSXSAVE; | 
| 424 | 		if (ECX & (1U << 30)) type_ |= tRDRAND; | 
| 425 | 		if (ECX & (1U << 29)) type_ |= tF16C; | 
| 426 |  | 
| 427 | 		if (EDX & (1U << 15)) type_ |= tCMOV; | 
| 428 | 		if (EDX & (1U << 23)) type_ |= tMMX; | 
| 429 | 		if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE; | 
| 430 | 		if (EDX & (1U << 26)) type_ |= tSSE2; | 
| 431 |  | 
| 432 | 		if (type_ & tOSXSAVE) { | 
| 433 | 			// check XFEATURE_ENABLED_MASK[2:1] = '11b' | 
| 434 | 			uint64 bv = getXfeature(); | 
| 435 | 			if ((bv & 6) == 6) { | 
| 436 | 				if (ECX & (1U << 28)) type_ |= tAVX; | 
| 437 | 				if (ECX & (1U << 12)) type_ |= tFMA; | 
| 438 | 				if (((bv >> 5) & 7) == 7) { | 
| 439 | 					getCpuidEx(7, 0, data); | 
| 440 | 					if (EBX & (1U << 16)) type_ |= tAVX512F; | 
| 441 | 					if (type_ & tAVX512F) { | 
| 442 | 						if (EBX & (1U << 17)) type_ |= tAVX512DQ; | 
| 443 | 						if (EBX & (1U << 21)) type_ |= tAVX512_IFMA; | 
| 444 | 						if (EBX & (1U << 26)) type_ |= tAVX512PF; | 
| 445 | 						if (EBX & (1U << 27)) type_ |= tAVX512ER; | 
| 446 | 						if (EBX & (1U << 28)) type_ |= tAVX512CD; | 
| 447 | 						if (EBX & (1U << 30)) type_ |= tAVX512BW; | 
| 448 | 						if (EBX & (1U << 31)) type_ |= tAVX512VL; | 
| 449 | 						if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; | 
| 450 | 						if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; | 
| 451 | 						if (ECX & (1U << 8)) type_ |= tGFNI; | 
| 452 | 						if (ECX & (1U << 9)) type_ |= tVAES; | 
| 453 | 						if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; | 
| 454 | 						if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; | 
| 455 | 						if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; | 
| 456 | 						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; | 
| 457 | 						if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; | 
| 458 | 						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; | 
| 459 | 					} | 
| 460 | 				} | 
| 461 | 			} | 
| 462 | 		} | 
| 463 | 		if (maxNum >= 7) { | 
| 464 | 			getCpuidEx(7, 0, data); | 
| 465 | 			if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; | 
| 466 | 			if (EBX & (1U << 3)) type_ |= tBMI1; | 
| 467 | 			if (EBX & (1U << 8)) type_ |= tBMI2; | 
| 468 | 			if (EBX & (1U << 9)) type_ |= tENHANCED_REP; | 
| 469 | 			if (EBX & (1U << 18)) type_ |= tRDSEED; | 
| 470 | 			if (EBX & (1U << 19)) type_ |= tADX; | 
| 471 | 			if (EBX & (1U << 20)) type_ |= tSMAP; | 
| 472 | 			if (EBX & (1U << 4)) type_ |= tHLE; | 
| 473 | 			if (EBX & (1U << 11)) type_ |= tRTM; | 
| 474 | 			if (EBX & (1U << 14)) type_ |= tMPX; | 
| 475 | 			if (EBX & (1U << 29)) type_ |= tSHA; | 
| 476 | 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; | 
| 477 | 		} | 
| 478 | 		setFamily(); | 
| 479 | 		setNumCores(); | 
| 480 | 		setCacheHierarchy(); | 
| 481 | 	} | 
| 482 | 	void putFamily() const | 
| 483 | 	{ | 
| 484 | 		printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n" , | 
| 485 | 			family, model, stepping, extFamily, extModel); | 
| 486 | 		printf("display:family=%X, model=%X\n" , displayFamily, displayModel); | 
| 487 | 	} | 
| 488 | 	bool has(Type type) const | 
| 489 | 	{ | 
| 490 | 		return (type & type_) != 0; | 
| 491 | 	} | 
| 492 | }; | 
| 493 |  | 
| 494 | class Clock { | 
| 495 | public: | 
| 496 | 	static inline uint64 getRdtsc() | 
| 497 | 	{ | 
| 498 | #ifdef XBYAK_INTEL_CPU_SPECIFIC | 
| 499 | 	#ifdef _MSC_VER | 
| 500 | 		return __rdtsc(); | 
| 501 | 	#else | 
| 502 | 		unsigned int eax, edx; | 
| 503 | 		__asm__ volatile("rdtsc"  : "=a" (eax), "=d" (edx)); | 
| 504 | 		return ((uint64)edx << 32) | eax; | 
| 505 | 	#endif | 
| 506 | #else | 
| 507 | 		// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu | 
| 508 | 		return 0; | 
| 509 | #endif | 
| 510 | 	} | 
| 511 | 	Clock() | 
| 512 | 		: clock_(0) | 
| 513 | 		, count_(0) | 
| 514 | 	{ | 
| 515 | 	} | 
| 516 | 	void begin() | 
| 517 | 	{ | 
| 518 | 		clock_ -= getRdtsc(); | 
| 519 | 	} | 
| 520 | 	void end() | 
| 521 | 	{ | 
| 522 | 		clock_ += getRdtsc(); | 
| 523 | 		count_++; | 
| 524 | 	} | 
| 525 | 	int getCount() const { return count_; } | 
| 526 | 	uint64 getClock() const { return clock_; } | 
| 527 | 	void clear() { count_ = 0; clock_ = 0; } | 
| 528 | private: | 
| 529 | 	uint64 clock_; | 
| 530 | 	int count_; | 
| 531 | }; | 
| 532 |  | 
| 533 | #ifdef XBYAK64 | 
// flags to be OR-ed into the tNum argument of StackFrame
const int UseRCX = 0x40; // 1 << 6
const int UseRDX = 0x80; // 1 << 7
| 536 |  | 
| 537 | class Pack { | 
| 538 | 	static const size_t maxTblNum = 15; | 
| 539 | 	const Xbyak::Reg64 *tbl_[maxTblNum]; | 
| 540 | 	size_t n_; | 
| 541 | public: | 
| 542 | 	Pack() : tbl_(), n_(0) {} | 
| 543 | 	Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); } | 
| 544 | 	Pack(const Pack& rhs) | 
| 545 | 		: n_(rhs.n_) | 
| 546 | 	{ | 
| 547 | 		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; | 
| 548 | 	} | 
| 549 | 	Pack& operator=(const Pack& rhs) | 
| 550 | 	{ | 
| 551 | 		n_ = rhs.n_; | 
| 552 | 		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; | 
| 553 | 		return *this; | 
| 554 | 	} | 
| 555 | 	Pack(const Xbyak::Reg64& t0) | 
| 556 | 	{ n_ = 1; tbl_[0] = &t0; } | 
| 557 | 	Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 558 | 	{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; } | 
| 559 | 	Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 560 | 	{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; } | 
| 561 | 	Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 562 | 	{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; } | 
| 563 | 	Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 564 | 	{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; } | 
| 565 | 	Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 566 | 	{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; } | 
| 567 | 	Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 568 | 	{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; } | 
| 569 | 	Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 570 | 	{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; } | 
| 571 | 	Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 572 | 	{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; } | 
| 573 | 	Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) | 
| 574 | 	{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; } | 
| 575 | 	Pack& append(const Xbyak::Reg64& t) | 
| 576 | 	{ | 
| 577 | 		if (n_ == maxTblNum) { | 
| 578 | 			fprintf(stderr, "ERR Pack::can't append\n" ); | 
| 579 | 			throw Error(ERR_BAD_PARAMETER); | 
| 580 | 		} | 
| 581 | 		tbl_[n_++] = &t; | 
| 582 | 		return *this; | 
| 583 | 	} | 
| 584 | 	void init(const Xbyak::Reg64 *tbl, size_t n) | 
| 585 | 	{ | 
| 586 | 		if (n > maxTblNum) { | 
| 587 | 			fprintf(stderr, "ERR Pack::init bad n=%d\n" , (int)n); | 
| 588 | 			throw Error(ERR_BAD_PARAMETER); | 
| 589 | 		} | 
| 590 | 		n_ = n; | 
| 591 | 		for (size_t i = 0; i < n; i++) { | 
| 592 | 			tbl_[i] = &tbl[i]; | 
| 593 | 		} | 
| 594 | 	} | 
| 595 | 	const Xbyak::Reg64& operator[](size_t n) const | 
| 596 | 	{ | 
| 597 | 		if (n >= n_) { | 
| 598 | 			fprintf(stderr, "ERR Pack bad n=%d(%d)\n" , (int)n, (int)n_); | 
| 599 | 			throw Error(ERR_BAD_PARAMETER); | 
| 600 | 		} | 
| 601 | 		return *tbl_[n]; | 
| 602 | 	} | 
| 603 | 	size_t size() const { return n_; } | 
| 604 | 	/* | 
| 605 | 		get tbl[pos, pos + num) | 
| 606 | 	*/ | 
| 607 | 	Pack sub(size_t pos, size_t num = size_t(-1)) const | 
| 608 | 	{ | 
| 609 | 		if (num == size_t(-1)) num = n_ - pos; | 
| 610 | 		if (pos + num > n_) { | 
| 611 | 			fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n" , (int)pos, (int)num); | 
| 612 | 			throw Error(ERR_BAD_PARAMETER); | 
| 613 | 		} | 
| 614 | 		Pack pack; | 
| 615 | 		pack.n_ = num; | 
| 616 | 		for (size_t i = 0; i < num; i++) { | 
| 617 | 			pack.tbl_[i] = tbl_[pos + i]; | 
| 618 | 		} | 
| 619 | 		return pack; | 
| 620 | 	} | 
| 621 | 	void put() const | 
| 622 | 	{ | 
| 623 | 		for (size_t i = 0; i < n_; i++) { | 
| 624 | 			printf("%s " , tbl_[i]->toString()); | 
| 625 | 		} | 
| 626 | 		printf("\n" ); | 
| 627 | 	} | 
| 628 | }; | 
| 629 |  | 
| 630 | class StackFrame { | 
| 631 | #ifdef XBYAK64_WIN | 
| 632 | 	static const int noSaveNum = 6; | 
| 633 | 	static const int rcxPos = 0; | 
| 634 | 	static const int rdxPos = 1; | 
| 635 | #else | 
| 636 | 	static const int noSaveNum = 8; | 
| 637 | 	static const int rcxPos = 3; | 
| 638 | 	static const int rdxPos = 2; | 
| 639 | #endif | 
| 640 | 	static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax | 
| 641 | 	Xbyak::CodeGenerator *code_; | 
| 642 | 	int pNum_; | 
| 643 | 	int tNum_; | 
| 644 | 	bool useRcx_; | 
| 645 | 	bool useRdx_; | 
| 646 | 	int saveNum_; | 
| 647 | 	int P_; | 
| 648 | 	bool makeEpilog_; | 
| 649 | 	Xbyak::Reg64 pTbl_[4]; | 
| 650 | 	Xbyak::Reg64 tTbl_[maxRegNum]; | 
| 651 | 	Pack p_; | 
| 652 | 	Pack t_; | 
| 653 | 	StackFrame(const StackFrame&); | 
| 654 | 	void operator=(const StackFrame&); | 
| 655 | public: | 
| 656 | 	const Pack& p; | 
| 657 | 	const Pack& t; | 
| 658 | 	/* | 
| 659 | 		make stack frame | 
| 660 | 		@param sf [in] this | 
| 661 | 		@param pNum [in] num of function parameter(0 <= pNum <= 4) | 
| 662 | 		@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14 | 
| 663 | 		@param stackSizeByte [in] local stack size | 
| 664 | 		@param makeEpilog [in] automatically call close() if true | 
| 665 |  | 
| 666 | 		you can use | 
| 667 | 		rax | 
| 668 | 		gp0, ..., gp(pNum - 1) | 
| 669 | 		gt0, ..., gt(tNum-1) | 
| 670 | 		rcx if tNum & UseRCX | 
| 671 | 		rdx if tNum & UseRDX | 
| 672 | 		rsp[0..stackSizeByte - 1] | 
| 673 | 	*/ | 
| 674 | 	StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true) | 
| 675 | 		: code_(code) | 
| 676 | 		, pNum_(pNum) | 
| 677 | 		, tNum_(tNum & ~(UseRCX | UseRDX)) | 
| 678 | 		, useRcx_((tNum & UseRCX) != 0) | 
| 679 | 		, useRdx_((tNum & UseRDX) != 0) | 
| 680 | 		, saveNum_(0) | 
| 681 | 		, P_(0) | 
| 682 | 		, makeEpilog_(makeEpilog) | 
| 683 | 		, p(p_) | 
| 684 | 		, t(t_) | 
| 685 | 	{ | 
| 686 | 		using namespace Xbyak; | 
| 687 | 		if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM); | 
| 688 | 		const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); | 
| 689 | 		if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM); | 
| 690 | 		const Reg64& _rsp = code->rsp; | 
| 691 | 		saveNum_ = (std::max)(0, allRegNum - noSaveNum); | 
| 692 | 		const int *tbl = getOrderTbl() + noSaveNum; | 
| 693 | 		for (int i = 0; i < saveNum_; i++) { | 
| 694 | 			code->push(Reg64(tbl[i])); | 
| 695 | 		} | 
| 696 | 		P_ = (stackSizeByte + 7) / 8; | 
| 697 | 		if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment | 
| 698 | 		P_ *= 8; | 
| 699 | 		if (P_ > 0) code->sub(_rsp, P_); | 
| 700 | 		int pos = 0; | 
| 701 | 		for (int i = 0; i < pNum; i++) { | 
| 702 | 			pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); | 
| 703 | 		} | 
| 704 | 		for (int i = 0; i < tNum_; i++) { | 
| 705 | 			tTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); | 
| 706 | 		} | 
| 707 | 		if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx); | 
| 708 | 		if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx); | 
| 709 | 		p_.init(pTbl_, pNum); | 
| 710 | 		t_.init(tTbl_, tNum_); | 
| 711 | 	} | 
| 712 | 	/* | 
| 713 | 		make epilog manually | 
| 714 | 		@param callRet [in] call ret() if true | 
| 715 | 	*/ | 
| 716 | 	void close(bool callRet = true) | 
| 717 | 	{ | 
| 718 | 		using namespace Xbyak; | 
| 719 | 		const Reg64& _rsp = code_->rsp; | 
| 720 | 		const int *tbl = getOrderTbl() + noSaveNum; | 
| 721 | 		if (P_ > 0) code_->add(_rsp, P_); | 
| 722 | 		for (int i = 0; i < saveNum_; i++) { | 
| 723 | 			code_->pop(Reg64(tbl[saveNum_ - 1 - i])); | 
| 724 | 		} | 
| 725 |  | 
| 726 | 		if (callRet) code_->ret(); | 
| 727 | 	} | 
| 728 | 	~StackFrame() | 
| 729 | 	{ | 
| 730 | 		if (!makeEpilog_) return; | 
| 731 | 		try { | 
| 732 | 			close(); | 
| 733 | 		} catch (std::exception& e) { | 
| 734 | 			printf("ERR:StackFrame %s\n" , e.what()); | 
| 735 | 			//exit(1); | 
| 736 | 		} | 
| 737 | 	} | 
| 738 | private: | 
| 739 | 	const int *getOrderTbl() const | 
| 740 | 	{ | 
| 741 | 		using namespace Xbyak; | 
| 742 | 		static const int tbl[] = { | 
| 743 | #ifdef XBYAK64_WIN | 
| 744 | 			Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI, | 
| 745 | #else | 
| 746 | 			Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, | 
| 747 | #endif | 
| 748 | 			Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15 | 
| 749 | 		}; | 
| 750 | 		return &tbl[0]; | 
| 751 | 	} | 
| 752 | 	int getRegIdx(int& pos) const | 
| 753 | 	{ | 
| 754 | 		assert(pos < maxRegNum); | 
| 755 | 		using namespace Xbyak; | 
| 756 | 		const int *tbl = getOrderTbl(); | 
| 757 | 		int r = tbl[pos++]; | 
| 758 | 		if (useRcx_) { | 
| 759 | 			if (r == Operand::RCX) { return Operand::R10; } | 
| 760 | 			if (r == Operand::R10) { r = tbl[pos++]; } | 
| 761 | 		} | 
| 762 | 		if (useRdx_) { | 
| 763 | 			if (r == Operand::RDX) { return Operand::R11; } | 
| 764 | 			if (r == Operand::R11) { return tbl[pos++]; } | 
| 765 | 		} | 
| 766 | 		return r; | 
| 767 | 	} | 
| 768 | }; | 
| 769 | #endif | 
| 770 |  | 
| 771 | } } // end of util | 
| 772 | #endif | 
| 773 |  |