// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
// CGENCPU.H -
//
// Various helper routines for generating AMD64 assembly code.
//
// DO NOT INCLUDE THIS FILE DIRECTLY - ALWAYS USE CGENSYS.H INSTEAD
//



#ifndef _TARGET_AMD64_
#error Should only include "AMD64\cgencpu.h" for AMD64 builds
#endif

#ifndef __cgencpu_h__
#define __cgencpu_h__

#include "xmmintrin.h"

// Given a return address retrieved during a stackwalk,
// this is the offset by which it should be decremented to land somewhere within a call instruction.
#define STACKWALK_CONTROLPC_ADJUST_OFFSET 1
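// Illustrative sketch (not part of the runtime API): a stackwalker that wants
// the control PC to point at the call site rather than the return site would
// adjust it as
//
//     PCODE controlPC = returnAddress - STACKWALK_CONTROLPC_ADJUST_OFFSET;
//
// so that the adjusted PC falls inside the call instruction that produced the
// return address.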

// preferred alignment for data
#define DATA_ALIGNMENT 8

class MethodDesc;
class FramedMethodFrame;
class Module;
struct VASigCookie;
class ComCallMethodDesc;

//
// functions implemented in AMD64 assembly
//
EXTERN_C void InstantiatingMethodStubWorker(void);
EXTERN_C void SinglecastDelegateInvokeStub();
EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr);

#define COMMETHOD_PREPAD 16 // # extra bytes to allocate in addition to sizeof(ComCallMethodDesc)
#define COMMETHOD_CALL_PRESTUB_SIZE 6 // 32-bit indirect relative call
#define COMMETHOD_CALL_PRESTUB_ADDRESS_OFFSET -10 // the offset of the call target address inside the prestub

#define STACK_ALIGN_SIZE 16

#define JUMP_ALLOCATE_SIZE 12 // # bytes to allocate for a 64-bit jump instruction
#define BACK_TO_BACK_JUMP_ALLOCATE_SIZE 12 // # bytes to allocate for a back-to-back 64-bit jump instruction
#define SIZEOF_LOAD_AND_JUMP_THUNK 22 // # bytes to mov r10, X; jmp Z
#define SIZEOF_LOAD2_AND_JUMP_THUNK 32 // # bytes to mov r10, X; mov r11, Y; jmp Z

// Also in Zapper.h, CorCompile.h, FnTableAccess.h
#define USE_INDIRECT_CODEHEADER // use CodeHeader, RealCodeHeader construct

#define HAS_NDIRECT_IMPORT_PRECODE 1
#define HAS_FIXUP_PRECODE 1
#define HAS_FIXUP_PRECODE_CHUNKS 1
#define FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS 1

// ThisPtrRetBufPrecode is necessary for closed delegates over static methods with a return buffer
#define HAS_THISPTR_RETBUF_PRECODE 1

#define CODE_SIZE_ALIGN 16 // code blocks must be allocated on at least 8-byte boundaries; we use 16-byte boundaries for perf reasons
#define CACHE_LINE_SIZE 64 // Current AMD64 processors have 64-byte cache lines as per the AMD64 optimization manual
#define LOG2SLOT LOG2_PTRSIZE


#ifdef UNIX_AMD64_ABI
#define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 16 // bytes
#define ENREGISTERED_PARAMTYPE_MAXSIZE 16 // bytes
#define ENREGISTERED_RETURNTYPE_MAXSIZE 16 // bytes
#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegisters parameter
#define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter
#else
#define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 8 // bytes
#define ENREGISTERED_PARAMTYPE_MAXSIZE 8 // bytes
#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes
#define COM_STUBS_SEPARATE_FP_LOCATIONS
#define CALLDESCR_REGTYPEMAP 1
#endif

#define INSTRFMT_K64SMALL
#define INSTRFMT_K64

#ifndef FEATURE_PAL
#define USE_REDIRECT_FOR_GCSTRESS
#endif // FEATURE_PAL

//
// REX prefix byte
//
#define REX_PREFIX_BASE 0x40 // 0100xxxx
#define REX_OPERAND_SIZE_64BIT 0x08 // xxxx1xxx
#define REX_MODRM_REG_EXT 0x04 // xxxxx1xx // use for 'middle' 3-bit field of mod/r/m
#define REX_SIB_INDEX_EXT 0x02 // xxxxxx1x
#define REX_MODRM_RM_EXT 0x01 // xxxxxxx1 // use for low 3-bit field of mod/r/m
#define REX_SIB_BASE_EXT 0x01 // xxxxxxx1
#define REX_OPCODE_REG_EXT 0x01 // xxxxxxx1
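// For illustration only: the REX prefix of the 'mov r10, imm64' encoding used
// by the thunks below (49 ba ...) is built from these masks as
//
//     BYTE rex = REX_PREFIX_BASE            // 0x40
//              | REX_OPERAND_SIZE_64BIT     // REX.W - 64-bit operand size
//              | REX_OPCODE_REG_EXT;        // REX.B - selects r10 rather than rdx
//     // rex == 0x49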

#define X86_REGISTER_MASK 0x7

#define X86RegFromAMD64Reg(extended_reg) \
    ((X86Reg)(((int)extended_reg) & X86_REGISTER_MASK))
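// Illustrative example (assumes the X86Reg enumeration follows the hardware
// encoding, with r8-r15 numbered 8-15): r10 has encoding 10 (0b1010), so
//
//     X86RegFromAMD64Reg((X86Reg)10) == (X86Reg)2
//
// i.e. the low 3 bits that go into the mod/r/m byte; the dropped high bit is
// supplied separately via a REX extension bit (REX_MODRM_RM_EXT and friends).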


//=======================================================================
// IMPORTANT: This value is used to figure out how much to allocate
// for a fixed array of FieldMarshalers. That means it must be at least
// as large as the largest FieldMarshaler subclass. This requirement
// is guarded by an assert.
//=======================================================================
#define MAXFIELDMARSHALERSIZE 40
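// A minimal sketch of the kind of guard the comment above refers to
// (FieldMarshaler_Illustrative is a hypothetical subclass name, not a real type):
//
//     static_assert(sizeof(FieldMarshaler_Illustrative) <= MAXFIELDMARSHALERSIZE,
//                   "MAXFIELDMARSHALERSIZE must cover the largest FieldMarshaler subclass");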


// Why is the return value ARG_SLOT? On 64-bit systems, that is 64-bits
// and much bigger than necessary for R4, requiring explicit downcasts.
inline
ARG_SLOT FPSpillToR4(void* pSpillSlot)
{
    LIMITED_METHOD_CONTRACT;
    return *(DWORD*)pSpillSlot;
}

inline
ARG_SLOT FPSpillToR8(void* pSpillSlot)
{
    LIMITED_METHOD_CONTRACT;
    return *(SIZE_T*)pSpillSlot;
}

inline
void R4ToFPSpill(void* pSpillSlot, DWORD srcFloatAsDWORD)
{
    LIMITED_METHOD_CONTRACT;
    *(SIZE_T*)pSpillSlot = (SIZE_T)srcFloatAsDWORD;
    *((SIZE_T*)pSpillSlot + 1) = 0;
}

inline
void R8ToFPSpill(void* pSpillSlot, SIZE_T srcDoubleAsSIZE_T)
{
    LIMITED_METHOD_CONTRACT;
    *(SIZE_T*)pSpillSlot = srcDoubleAsSIZE_T;
    *((SIZE_T*)pSpillSlot + 1) = 0;
}
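// A minimal usage sketch (illustrative only; someFloat and pSlot are not
// runtime names): spilling a 4-byte float into a spill slot and reading it
// back as an ARG_SLOT.
//
//     DWORD asBits;
//     memcpy(&asBits, &someFloat, sizeof(asBits));   // reinterpret the float's bits
//     R4ToFPSpill(pSlot, asBits);                    // low qword = bits, upper qword zeroed
//     ARG_SLOT widened = FPSpillToR4(pSlot);         // reads back the low DWORD, zero-extended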


#ifdef CROSSGEN_COMPILE
#define GetEEFuncEntryPoint(pfn) 0x1001
#else
#define GetEEFuncEntryPoint(pfn) GFN_TADDR(pfn)
#endif


//**********************************************************************
// Parameter size
//**********************************************************************

typedef INT64 StackElemType;
#define STACK_ELEM_SIZE sizeof(StackElemType)

// !! This expression assumes STACK_ELEM_SIZE is a power of 2.
#define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1)))
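// For example (sketch values, assuming the 8-byte STACK_ELEM_SIZE above):
//
//     StackElemSize(1) == 8      // sub-slot arguments round up to a full slot
//     StackElemSize(8) == 8
//     StackElemSize(9) == 16     // anything larger spills into another slot
//
// The masking trick only rounds correctly because STACK_ELEM_SIZE is a power of 2.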

//**********************************************************************
// Frames
//**********************************************************************
//--------------------------------------------------------------------
// This represents some of the TransitionFrame fields that are
// stored at negative offsets.
//--------------------------------------------------------------------
struct REGDISPLAY;

//--------------------------------------------------------------------
// This represents the arguments that are stored in volatile registers.
// This should not overlap the CalleeSavedRegisters since those are already
// saved separately and it would be wasteful to save the same register twice.
// If we do use a non-volatile register as an argument, then the ArgIterator
// will probably have to communicate this back to the PromoteCallerStack
// routine to avoid a double promotion.
//--------------------------------------------------------------------
#ifdef UNIX_AMD64_ABI

#define ENUM_ARGUMENT_REGISTERS() \
    ARGUMENT_REGISTER(RDI) \
    ARGUMENT_REGISTER(RSI) \
    ARGUMENT_REGISTER(RDX) \
    ARGUMENT_REGISTER(RCX) \
    ARGUMENT_REGISTER(R8) \
    ARGUMENT_REGISTER(R9)

#define NUM_ARGUMENT_REGISTERS 6

// The order of registers in this macro is hardcoded in assembly code
// in a number of places
#define ENUM_CALLEE_SAVED_REGISTERS() \
    CALLEE_SAVED_REGISTER(R12) \
    CALLEE_SAVED_REGISTER(R13) \
    CALLEE_SAVED_REGISTER(R14) \
    CALLEE_SAVED_REGISTER(R15) \
    CALLEE_SAVED_REGISTER(Rbx) \
    CALLEE_SAVED_REGISTER(Rbp)

#define NUM_CALLEE_SAVED_REGISTERS 6

#else // UNIX_AMD64_ABI

#define ENUM_ARGUMENT_REGISTERS() \
    ARGUMENT_REGISTER(RCX) \
    ARGUMENT_REGISTER(RDX) \
    ARGUMENT_REGISTER(R8) \
    ARGUMENT_REGISTER(R9)

#define NUM_ARGUMENT_REGISTERS 4

// The order of registers in this macro is hardcoded in assembly code
// in a number of places
#define ENUM_CALLEE_SAVED_REGISTERS() \
    CALLEE_SAVED_REGISTER(Rdi) \
    CALLEE_SAVED_REGISTER(Rsi) \
    CALLEE_SAVED_REGISTER(Rbx) \
    CALLEE_SAVED_REGISTER(Rbp) \
    CALLEE_SAVED_REGISTER(R12) \
    CALLEE_SAVED_REGISTER(R13) \
    CALLEE_SAVED_REGISTER(R14) \
    CALLEE_SAVED_REGISTER(R15)

#define NUM_CALLEE_SAVED_REGISTERS 8

#endif // UNIX_AMD64_ABI

typedef DPTR(struct ArgumentRegisters) PTR_ArgumentRegisters;
struct ArgumentRegisters {
    #define ARGUMENT_REGISTER(regname) INT_PTR regname;
    ENUM_ARGUMENT_REGISTERS();
    #undef ARGUMENT_REGISTER
};

typedef DPTR(struct CalleeSavedRegisters) PTR_CalleeSavedRegisters;
struct CalleeSavedRegisters {
    #define CALLEE_SAVED_REGISTER(regname) INT_PTR regname;
    ENUM_CALLEE_SAVED_REGISTERS();
    #undef CALLEE_SAVED_REGISTER
};

struct CalleeSavedRegistersPointers {
    #define CALLEE_SAVED_REGISTER(regname) PTR_TADDR p##regname;
    ENUM_CALLEE_SAVED_REGISTERS();
    #undef CALLEE_SAVED_REGISTER
};

#define SCRATCH_REGISTER_X86REG kRAX

#ifdef UNIX_AMD64_ABI
#define THIS_REG RDI
#define THIS_kREG kRDI

#define ARGUMENT_kREG1 kRDI
#define ARGUMENT_kREG2 kRSI
#else
#define THIS_REG RCX
#define THIS_kREG kRCX

#define ARGUMENT_kREG1 kRCX
#define ARGUMENT_kREG2 kRDX
#endif

#ifdef UNIX_AMD64_ABI

#define NUM_FLOAT_ARGUMENT_REGISTERS 8

typedef DPTR(struct FloatArgumentRegisters) PTR_FloatArgumentRegisters;
struct FloatArgumentRegisters {
    M128A d[NUM_FLOAT_ARGUMENT_REGISTERS]; // xmm0-xmm7
};

#endif


void UpdateRegDisplayFromCalleeSavedRegisters(REGDISPLAY * pRD, CalleeSavedRegisters * pRegs);


// Sufficient context for Try/Catch restoration.
struct EHContext {
    // Not used
};

#define ARGUMENTREGISTERS_SIZE sizeof(ArgumentRegisters)


#include "stublinkeramd64.h"



//**********************************************************************
// Exception handling
//**********************************************************************

inline PCODE GetIP(const CONTEXT * context)
{
    CONTRACTL
    {
        SO_TOLERANT;
        NOTHROW;
        GC_NOTRIGGER;
        SUPPORTS_DAC;

        PRECONDITION(CheckPointer(context));
    }
    CONTRACTL_END;

    return PCODE(context->Rip);
}

inline void SetIP(CONTEXT* context, PCODE rip)
{
    CONTRACTL
    {
        SO_TOLERANT;
        NOTHROW;
        GC_NOTRIGGER;
        SUPPORTS_DAC;

        PRECONDITION(CheckPointer(context));
    }
    CONTRACTL_END;

    context->Rip = (DWORD64) rip;
}

inline TADDR GetSP(const CONTEXT * context)
{
    CONTRACTL
    {
        SO_TOLERANT;
        NOTHROW;
        GC_NOTRIGGER;
        SUPPORTS_DAC;

        PRECONDITION(CheckPointer(context));
    }
    CONTRACTL_END;

    return (TADDR)context->Rsp;
}

inline void SetSP(CONTEXT *context, TADDR rsp)
{
    CONTRACTL
    {
        SO_TOLERANT;
        NOTHROW;
        GC_NOTRIGGER;
        SUPPORTS_DAC;

        PRECONDITION(CheckPointer(context));
    }
    CONTRACTL_END;

    context->Rsp = rsp;
}

#define SetFP(context, ebp)
inline TADDR GetFP(const CONTEXT * context)
{
    LIMITED_METHOD_CONTRACT;

    return (TADDR)(context->Rbp);
}

extern "C" TADDR GetCurrentSP();

// Emits:
//  mov r10, pv1
//  mov rax, pTarget
//  jmp rax
void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget);
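// Hedged usage sketch (pThunk, pSecretArg and pTarget are illustrative names,
// not runtime fields); the destination buffer presumably needs to provide
// SIZEOF_LOAD_AND_JUMP_THUNK bytes of writable, executable memory:
//
//     BYTE* pThunk = /* allocate thunk storage */;
//     EncodeLoadAndJumpThunk(pThunk, pSecretArg, pTarget);
//     // pThunk now holds: mov r10, pSecretArg / mov rax, pTarget / jmp rax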


// Get Rel32 destination, emit jumpStub if necessary
INT32 rel32UsingJumpStub(INT32 UNALIGNED * pRel32, PCODE target, MethodDesc *pMethod,
    LoaderAllocator *pLoaderAllocator = NULL, bool throwOnOutOfMemoryWithinRange = true);

// Get Rel32 destination, emit jumpStub if necessary into a preallocated location
INT32 rel32UsingPreallocatedJumpStub(INT32 UNALIGNED * pRel32, PCODE target, PCODE jumpStubAddr, bool emitJump);

void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target);

void emitJump(LPBYTE pBuffer, LPVOID target);

BOOL isJumpRel32(PCODE pCode);
PCODE decodeJump32(PCODE pCode);

BOOL isJumpRel64(PCODE pCode);
PCODE decodeJump64(PCODE pCode);

//
// On IA64, back-to-back jumps should be separated by a nop bundle to get
// the best performance from the hardware's branch prediction logic.
// For all other platforms, back-to-back jumps don't require anything special.
// That is why we have these two wrapper functions that call emitJump and decodeJump.
//
inline void emitBackToBackJump(LPBYTE pBuffer, LPVOID target)
{
    WRAPPER_NO_CONTRACT;

    emitJump(pBuffer, target);
}

inline BOOL isBackToBackJump(PCODE pCode)
{
    WRAPPER_NO_CONTRACT;
    SUPPORTS_DAC;
    return isJumpRel32(pCode) || isJumpRel64(pCode);
}

inline PCODE decodeBackToBackJump(PCODE pCode)
{
    WRAPPER_NO_CONTRACT;
    SUPPORTS_DAC;
    if (isJumpRel32(pCode))
        return decodeJump32(pCode);
    else if (isJumpRel64(pCode))
        return decodeJump64(pCode);
    else
        return NULL;
}

extern "C" void setFPReturn(int fpSize, INT64 retVal);
extern "C" void getFPReturn(int fpSize, INT64 *retval);


struct ComToManagedExRecord; // defined in cgencpu.cpp

inline BOOL IsUnmanagedValueTypeReturnedByRef(UINT sizeofvaluetype)
{
    LIMITED_METHOD_CONTRACT;

    if (sizeofvaluetype > ENREGISTERED_RETURNTYPE_MAXSIZE)
    {
        return TRUE;
    }
    else
    {
        return FALSE;
    }
}

#include <pshpack1.h>
struct DECLSPEC_ALIGN(8) UMEntryThunkCode
{
    // padding                // CC CC CC CC
    // mov r10, pUMEntryThunk // 49 ba xx xx xx xx xx xx xx xx // METHODDESC_REGISTER
    // mov rax, pJmpDest      // 48 b8 xx xx xx xx xx xx xx xx // need to ensure this imm64 is qword aligned
    // TAILJMP_RAX            // 48 FF E0

    BYTE m_padding[4];
    BYTE m_movR10[2];       // MOV R10,
    LPVOID m_uet;           // pointer to start of this structure
    BYTE m_movRAX[2];       // MOV RAX,
    DECLSPEC_ALIGN(8)
    const BYTE* m_execstub; // pointer to destination code // ensure this is qword aligned
    BYTE m_jmpRAX[3];       // JMP RAX
    BYTE m_padding2[5];

    void Encode(BYTE* pTargetCode, void* pvSecretParam);
    void Poison();

    LPCBYTE GetEntryPoint() const
    {
        LIMITED_METHOD_CONTRACT;

        return (LPCBYTE)&m_movR10;
    }

    static int GetEntryPointOffset()
    {
        LIMITED_METHOD_CONTRACT;

        return offsetof(UMEntryThunkCode, m_movR10);
    }
};
#include <poppack.h>

struct HijackArgs
{
#ifndef FEATURE_MULTIREG_RETURN
    union
    {
        ULONG64 Rax;
        ULONG64 ReturnValue[1];
    };
#else // FEATURE_MULTIREG_RETURN
    union
    {
        struct
        {
            ULONG64 Rax;
            ULONG64 Rdx;
        };
        ULONG64 ReturnValue[2];
    };
#endif // FEATURE_MULTIREG_RETURN
    CalleeSavedRegisters Regs;
    union
    {
        ULONG64 Rip;
        size_t ReturnAddress;
    };
};

#ifndef DACCESS_COMPILE

DWORD GetOffsetAtEndOfFunction(ULONGLONG uImageBase,
                               PT_RUNTIME_FUNCTION pFunctionEntry,
                               int offsetNum = 1);

#endif // DACCESS_COMPILE

// ClrFlushInstructionCache is used when we want to call FlushInstructionCache
// for a specific architecture in the common code, but not for other architectures.
// We call ClrFlushInstructionCache whenever we create or modify code in the heap.
// Currently ClrFlushInstructionCache has no effect on AMD64.
//

inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode)
{
    // FlushInstructionCache(GetCurrentProcess(), pCodeAddr, sizeOfCode);
    MemoryBarrier();
    return TRUE;
}

//
// JIT HELPER ALIASING FOR PORTABILITY.
//
// Create aliases for the optimized implementations of helpers provided on this platform.
//
#define JIT_GetSharedGCStaticBase JIT_GetSharedGCStaticBase_SingleAppDomain
#define JIT_GetSharedNonGCStaticBase JIT_GetSharedNonGCStaticBase_SingleAppDomain
#define JIT_GetSharedGCStaticBaseNoCtor JIT_GetSharedGCStaticBaseNoCtor_SingleAppDomain
#define JIT_GetSharedNonGCStaticBaseNoCtor JIT_GetSharedNonGCStaticBaseNoCtor_SingleAppDomain

#ifndef FEATURE_PAL
#define JIT_ChkCastClass JIT_ChkCastClass
#define JIT_ChkCastClassSpecial JIT_ChkCastClassSpecial
#define JIT_IsInstanceOfClass JIT_IsInstanceOfClass
#define JIT_ChkCastInterface JIT_ChkCastInterface
#define JIT_IsInstanceOfInterface JIT_IsInstanceOfInterface
#endif // FEATURE_PAL

#define JIT_Stelem_Ref JIT_Stelem_Ref

#endif // __cgencpu_h__