1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
5/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7XX XX
8XX Amd64/x86 Code Generator XX
9XX XX
10XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12*/
13#include "jitpch.h"
14#ifdef _MSC_VER
15#pragma hdrstop
16#endif
17
18#ifdef _TARGET_XARCH_
19#include "emit.h"
20#include "codegen.h"
21#include "lower.h"
22#include "gcinfo.h"
23#include "gcinfoencoder.h"
24
25/*****************************************************************************
26 *
27 * Generate code that will set the given register to the integer constant.
28 */
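// A minimal sketch of what this produces (exact encoding depends on 'type' and 'flags'):
//   xor reg, reg   ; when val == 0
//   mov reg, val   ; otherwise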
29
30void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags)
31{
32 // Reg cannot be a FP reg
33 assert(!genIsValidFloatReg(reg));
34
    // The only TYP_REF constant that can come down this path is a managed 'null', since it
    // is not relocatable. Other ref type constants (e.g. string objects) go through a
    // different code path.
38 noway_assert(type != TYP_REF || val == 0);
39
40 if (val == 0)
41 {
42 instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags);
43 }
44 else
45 {
46 // TODO-XArch-CQ: needs all the optimized cases
47 getEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val);
48 }
49}
50
51/*****************************************************************************
52 *
53 * Generate code to check that the GS cookie wasn't thrashed by a buffer
 * overrun. If pushReg is true, preserve all registers around the code sequence.
 * Otherwise ECX could be modified.
 *
 * Implementation Note: pushReg is true in the case of tail calls.
58 */
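// Illustrative shape of the emitted check (a sketch; the addressing mode, register and
// immediate vary with the cases handled below):
//   cmp  [GS cookie frame slot], imm32/reg
//   je   ok
//   call CORINFO_HELP_FAIL_FAST
// ok: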
59void CodeGen::genEmitGSCookieCheck(bool pushReg)
60{
61 noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal);
62
    // Make sure that EAX is reported as a live GC-ref, so that a GC that kicks in while
    // executing the GS cookie check will not collect the object pointed to by EAX.
    //
    // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX.
    // In such a case, make sure that the correct GC-ness of RDX is reported as well, so
    // that a GC object pointed to by RDX will not be collected.
69 if (!pushReg)
70 {
71 // Handle multi-reg return type values
72 if (compiler->compMethodReturnsMultiRegRetType())
73 {
74 ReturnTypeDesc retTypeDesc;
75 if (varTypeIsLong(compiler->info.compRetNativeType))
76 {
77 retTypeDesc.InitializeLongReturnType(compiler);
78 }
79 else // we must have a struct return type
80 {
81 retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass);
82 }
83
84 unsigned regCount = retTypeDesc.GetReturnRegCount();
85
            // Only the x86 and x64 Unix ABIs allow multi-reg returns, and the number of
            // result regs should equal MAX_RET_REG_COUNT.
88 assert(regCount == MAX_RET_REG_COUNT);
89
90 for (unsigned i = 0; i < regCount; ++i)
91 {
92 gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i));
93 }
94 }
95 else if (compiler->compMethodReturnsRetBufAddr())
96 {
            // This is for returning via an implicit RetBuf.
            // If the address of the buffer is returned in REG_INTRET, mark the content of
            // REG_INTRET as a byref.

            // Since the return is via an implicit RetBuf, the native return type should be a struct.
101 assert(varTypeIsStruct(compiler->info.compRetNativeType));
102
103 gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF);
104 }
105 // ... all other cases.
106 else
107 {
108#ifdef _TARGET_AMD64_
            // For x64, structs that are not returned in registers are always
            // returned via an implicit RetBuf. If we reached here, we should not have
            // a RetBuf and the return type should not be a struct.
112 assert(compiler->info.compRetBuffArg == BAD_VAR_NUM);
113 assert(!varTypeIsStruct(compiler->info.compRetNativeType));
114#endif // _TARGET_AMD64_
115
            // For x86 Windows we can't make such assertions, since we generate code that returns
            // the RetBuf in REG_INTRET only when the ProfilerHook is enabled; otherwise
            // compRetNativeType could be TYP_STRUCT.
119 gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType);
120 }
121 }
122
123 regNumber regGSCheck;
124 regMaskTP regMaskGSCheck = RBM_NONE;
125
126 if (!pushReg)
127 {
        // Non-tail call: we can use any callee-trash register that is not a return
        // register and does not contain the 'this' pointer (which must be kept alive),
        // since we are generating the GS cookie check after a GT_RETURN block.
        // Note: On Amd64 System V, RDX is an arg register - REG_ARG_2 - as well as a
        // return register for two-register-returned structs.
133 if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister &&
134 (compiler->lvaTable[compiler->info.compThisArg].lvRegNum == REG_ARG_0))
135 {
136 regGSCheck = REG_ARG_1;
137 }
138 else
139 {
140 regGSCheck = REG_ARG_0;
141 }
142 }
143 else
144 {
145#ifdef _TARGET_X86_
146 // It doesn't matter which register we pick, since we're going to save and restore it
147 // around the check.
148 // TODO-CQ: Can we optimize the choice of register to avoid doing the push/pop sometimes?
149 regGSCheck = REG_EAX;
150 regMaskGSCheck = RBM_EAX;
151#else // !_TARGET_X86_
152 // Tail calls from methods that need GS check: We need to preserve registers while
153 // emitting GS cookie check for a tail prefixed call or a jmp. To emit GS cookie
154 // check, we might need a register. This won't be an issue for jmp calls for the
155 // reason mentioned below (see comment starting with "Jmp Calls:").
156 //
157 // The following are the possible solutions in case of tail prefixed calls:
158 // 1) Use R11 - ignore tail prefix on calls that need to pass a param in R11 when
159 // present in methods that require GS cookie check. Rest of the tail calls that
160 // do not require R11 will be honored.
161 // 2) Internal register - GT_CALL node reserves an internal register and emits GS
162 // cookie check as part of tail call codegen. GenExitCode() needs to special case
163 // fast tail calls implemented as epilog+jmp or such tail calls should always get
164 // dispatched via helper.
165 // 3) Materialize GS cookie check as a separate node hanging off GT_CALL node in
166 // right execution order during rationalization.
167 //
168 // There are two calls that use R11: VSD and calli pinvokes with cookie param. Tail
169 // prefix on pinvokes is ignored. That is, options 2 and 3 will allow tail prefixed
170 // VSD calls from methods that need GS check.
171 //
        // Tail prefixed calls: Right now, for Jit64 compatibility, a method requiring a GS cookie
        // check ignores the tail prefix. In the future, if we intend to support tail calls from
        // such methods, consider one of the options mentioned above. For now, assert that we
        // don't expect to see a tail call in a method that requires a GS check.
176 noway_assert(!compiler->compTailCallUsed);
177
        // Jmp calls: these specify a method handle that the JIT uses to query the VM for the
        // entry point address, so they can be neither VSD calls nor PInvoke calli with a cookie
        // parameter. Therefore, for jmp calls it is safe to use R11.
181 regGSCheck = REG_R11;
182#endif // !_TARGET_X86_
183 }
184
185 regMaskTP byrefPushedRegs = RBM_NONE;
186 regMaskTP norefPushedRegs = RBM_NONE;
187 regMaskTP pushedRegs = RBM_NONE;
188
189 if (compiler->gsGlobalSecurityCookieAddr == nullptr)
190 {
191#if defined(_TARGET_AMD64_)
192 // If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'.
193 // Otherwise, load the value into a reg and use 'cmp mem64, reg64'.
194 if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal)
195 {
196 genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL);
197 getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
198 }
199 else
200#endif // defined(_TARGET_AMD64_)
201 {
202 assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal);
203 getEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0,
204 (int)compiler->gsGlobalSecurityCookieVal);
205 }
206 }
207 else
208 {
209 // Ngen case - GS cookie value needs to be accessed through an indirection.
210
211 pushedRegs = genPushRegs(regMaskGSCheck, &byrefPushedRegs, &norefPushedRegs);
212
213 instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr);
214 getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0);
215 getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
216 }
217
218 BasicBlock* gsCheckBlk = genCreateTempLabel();
219 emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
220 inst_JMP(jmpEqual, gsCheckBlk);
221 genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN);
222 genDefineTempLabel(gsCheckBlk);
223
224 genPopRegs(pushedRegs, byrefPushedRegs, norefPushedRegs);
225}
226
227BasicBlock* CodeGen::genCallFinally(BasicBlock* block)
228{
229#if FEATURE_EH_FUNCLETS
230 // Generate a call to the finally, like this:
231 // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym
232 // call finally-funclet
233 // jmp finally-return // Only for non-retless finally calls
234 // The jmp can be a NOP if we're going to the next block.
235 // If we're generating code for the main function (not a funclet), and there is no localloc,
236 // then RSP at this point is the same value as that stored in the PSPSym. So just copy RSP
237 // instead of loading the PSPSym in this case, or if PSPSym is not used (CoreRT ABI).
238
239 if ((compiler->lvaPSPSym == BAD_VAR_NUM) ||
240 (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT)))
241 {
242#ifndef UNIX_X86_ABI
243 inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL);
244#endif // !UNIX_X86_ABI
245 }
246 else
247 {
248 getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0);
249 }
250 getEmitter()->emitIns_J(INS_call, block->bbJumpDest);
251
252 if (block->bbFlags & BBF_RETLESS_CALL)
253 {
254 // We have a retless call, and the last instruction generated was a call.
255 // If the next block is in a different EH region (or is the end of the code
256 // block), then we need to generate a breakpoint here (since it will never
257 // get executed) to get proper unwind behavior.
258
259 if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
260 {
261 instGen(INS_BREAKPOINT); // This should never get executed
262 }
263 }
264 else
265 {
266// TODO-Linux-x86: Do we need to handle the GC information for this NOP or JMP specially, as is done for other
267// architectures?
268#ifndef JIT32_GCENCODER
269 // Because of the way the flowgraph is connected, the liveness info for this one instruction
        // after the call is not (cannot be) correct in cases where a variable has a last use in the
271 // handler. So turn off GC reporting for this single instruction.
272 getEmitter()->emitDisableGC();
273#endif // JIT32_GCENCODER
274
275 // Now go to where the finally funclet needs to return to.
276 if (block->bbNext->bbJumpDest == block->bbNext->bbNext)
277 {
278 // Fall-through.
279 // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly
280 // to the next instruction? This would depend on stack walking from within the finally
281 // handler working without this instruction being in this special EH region.
282 instGen(INS_nop);
283 }
284 else
285 {
286 inst_JMP(EJ_jmp, block->bbNext->bbJumpDest);
287 }
288
289#ifndef JIT32_GCENCODER
290 getEmitter()->emitEnableGC();
291#endif // JIT32_GCENCODER
292 }
293
294#else // !FEATURE_EH_FUNCLETS
295
296 // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot
297 // corresponding to the finally's nesting level. When invoked in response to an exception, the
298 // EE does this.
299 //
300 // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS.
301 //
302 // We will emit :
303 // mov [ebp - (n + 1)], 0
304 // mov [ebp - n ], 0xFC
305 // push &step
306 // jmp finallyBlock
307 // ...
308 // step:
309 // mov [ebp - n ], 0
310 // jmp leaveTarget
311 // ...
312 // leaveTarget:
313
314 noway_assert(isFramePointerUsed());
315
316 // Get the nesting level which contains the finally
317 unsigned finallyNesting = 0;
318 compiler->fgGetNestingLevel(block, &finallyNesting);
319
320 // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
321 unsigned filterEndOffsetSlotOffs;
322 filterEndOffsetSlotOffs = (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
323
324 unsigned curNestingSlotOffs;
325 curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE));
326
327 // Zero out the slot for the next nesting level
328 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar,
329 curNestingSlotOffs - TARGET_POINTER_SIZE);
330 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, LCL_FINALLY_MARK, compiler->lvaShadowSPslotsVar,
331 curNestingSlotOffs);
332
333 // Now push the address where the finally funclet should return to directly.
334 if (!(block->bbFlags & BBF_RETLESS_CALL))
335 {
336 assert(block->isBBCallAlwaysPair());
337 getEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest);
338 }
339 else
340 {
        // The EE expects a DWORD, so we give it 0.
342 inst_IV(INS_push_hide, 0);
343 }
344
345 // Jump to the finally BB
346 inst_JMP(EJ_jmp, block->bbJumpDest);
347
348#endif // !FEATURE_EH_FUNCLETS
349
350 // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the
351 // jump target using bbJumpDest - that is already used to point
352 // to the finally block. So just skip past the BBJ_ALWAYS unless the
353 // block is RETLESS.
354 if (!(block->bbFlags & BBF_RETLESS_CALL))
355 {
356 assert(block->isBBCallAlwaysPair());
357 block = block->bbNext;
358 }
359 return block;
360}
361
362#if FEATURE_EH_FUNCLETS
363void CodeGen::genEHCatchRet(BasicBlock* block)
364{
365 // Set RAX to the address the VM should return to after the catch.
366 // Generate a RIP-relative
367 // lea reg, [rip + disp32] ; the RIP is implicit
368 // which will be position-independent.
369 getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET);
370}
371
372#else // !FEATURE_EH_FUNCLETS
373
374void CodeGen::genEHFinallyOrFilterRet(BasicBlock* block)
375{
376 // The last statement of the block must be a GT_RETFILT, which has already been generated.
377 assert(block->lastNode() != nullptr);
378 assert(block->lastNode()->OperGet() == GT_RETFILT);
379
380 if (block->bbJumpKind == BBJ_EHFINALLYRET)
381 {
382 assert(block->lastNode()->gtOp.gtOp1 == nullptr); // op1 == nullptr means endfinally
383
        // Return using a pop-jmp sequence. As the "try" block calls the finally with a jmp,
        // this leaves the x86 call-ret stack balanced on the normal flow path.
387
388 noway_assert(isFramePointerRequired());
389 inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
390 inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
391 }
392 else
393 {
394 assert(block->bbJumpKind == BBJ_EHFILTERRET);
395
396 // The return value has already been computed.
397 instGen_Return(0);
398 }
399}
400
401#endif // !FEATURE_EH_FUNCLETS
402
403// Move an immediate value into an integer register
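// Roughly (a sketch; the relocatable case applies only when the immediate can be encoded
// as a PC-relative offset):
//   xor reg, reg            ; imm == 0 and no reloc
//   lea reg, [rip + disp32] ; PC-relative relocatable immediate
//   mov reg, imm            ; otherwise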
404
405void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags)
406{
407 // reg cannot be a FP register
408 assert(!genIsValidFloatReg(reg));
409
410 if (!compiler->opts.compReloc)
411 {
412 size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
413 }
414
415 if ((imm == 0) && !EA_IS_RELOC(size))
416 {
417 instGen_Set_Reg_To_Zero(size, reg, flags);
418 }
419 else
420 {
421 if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm))
422 {
423 emitAttr newSize = EA_PTR_DSP_RELOC;
424 if (EA_IS_BYREF(size))
425 {
426 newSize = EA_SET_FLG(newSize, EA_BYREF_FLG);
427 }
428
429 getEmitter()->emitIns_R_AI(INS_lea, newSize, reg, imm);
430 }
431 else
432 {
433 getEmitter()->emitIns_R_I(INS_mov, size, reg, imm);
434 }
435 }
436 regSet.verifyRegUsed(reg);
437}
438
439/***********************************************************************************
440 *
441 * Generate code to set a register 'targetReg' of type 'targetType' to the constant
442 * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call
443 * genProduceReg() on the target register.
444 */
445void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree)
446{
447 switch (tree->gtOper)
448 {
449 case GT_CNS_INT:
450 {
451 // relocatable values tend to come down as a CNS_INT of native int type
452 // so the line between these two opcodes is kind of blurry
453 GenTreeIntConCommon* con = tree->AsIntConCommon();
454 ssize_t cnsVal = con->IconValue();
455
456 if (con->ImmedValNeedsReloc(compiler))
457 {
458 emitAttr size = EA_HANDLE_CNS_RELOC;
459
460 if (targetType == TYP_BYREF)
461 {
462 size = EA_SET_FLG(size, EA_BYREF_FLG);
463 }
464
465 instGen_Set_Reg_To_Imm(size, targetReg, cnsVal);
466 regSet.verifyRegUsed(targetReg);
467 }
468 else
469 {
470 genSetRegToIcon(targetReg, cnsVal, targetType);
471 }
472 }
473 break;
474
475 case GT_CNS_DBL:
476 {
477 emitter* emit = getEmitter();
478 emitAttr size = emitTypeSize(targetType);
479 double constValue = tree->gtDblCon.gtDconVal;
480
            // Make sure we use "xorps reg, reg" only for the positive zero constant (0.0)
            // and not for negative zero (-0.0).
482 if (*(__int64*)&constValue == 0)
483 {
484 // A faster/smaller way to generate 0
485 emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg);
486 }
487 else
488 {
489 CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size);
490 emit->emitIns_R_C(ins_Load(targetType), size, targetReg, hnd, 0);
491 }
492 }
493 break;
494
495 default:
496 unreached();
497 }
498}
499
500//------------------------------------------------------------------------
501// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node.
502//
503// Arguments:
504// tree - the node
505//
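// For the integer case this is roughly (a sketch):
//   mov     targetReg, operandReg   ; only if the registers differ
//   neg/not targetReg
// The floating-point case (GT_NEG only) is handled by genSSE2BitwiseOp.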
506void CodeGen::genCodeForNegNot(GenTree* tree)
507{
508 assert(tree->OperIs(GT_NEG, GT_NOT));
509
510 regNumber targetReg = tree->gtRegNum;
511 var_types targetType = tree->TypeGet();
512
513 if (varTypeIsFloating(targetType))
514 {
515 assert(tree->gtOper == GT_NEG);
516 genSSE2BitwiseOp(tree);
517 }
518 else
519 {
520 GenTree* operand = tree->gtGetOp1();
521 assert(operand->isUsedFromReg());
522 regNumber operandReg = genConsumeReg(operand);
523
524 if (operandReg != targetReg)
525 {
526 inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
527 }
528
529 instruction ins = genGetInsForOper(tree->OperGet(), targetType);
530 inst_RV(ins, targetReg, targetType);
531 }
532
533 genProduceReg(tree);
534}
535
536//------------------------------------------------------------------------
537// genCodeForBswap: Produce code for a GT_BSWAP / GT_BSWAP16 node.
538//
539// Arguments:
540// tree - the node
541//
542void CodeGen::genCodeForBswap(GenTree* tree)
543{
544 // TODO: If we're swapping immediately after a read from memory or immediately before
545 // a write to memory, use the MOVBE instruction instead of the BSWAP instruction if
546 // the platform supports it.
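    // Roughly (a sketch):
    //   mov   targetReg, operandReg   ; only if the registers differ
    //   bswap targetReg               ; GT_BSWAP (32/64-bit)
    //   ror   targetReg.16, 8         ; GT_BSWAP16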
547
548 assert(tree->OperIs(GT_BSWAP, GT_BSWAP16));
549
550 regNumber targetReg = tree->gtRegNum;
551 var_types targetType = tree->TypeGet();
552
553 GenTree* operand = tree->gtGetOp1();
554 assert(operand->isUsedFromReg());
555 regNumber operandReg = genConsumeReg(operand);
556
557 if (operandReg != targetReg)
558 {
559 inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
560 }
561
562 if (tree->OperIs(GT_BSWAP))
563 {
564 // 32-bit and 64-bit byte swaps use "bswap reg"
565 inst_RV(INS_bswap, targetReg, targetType);
566 }
567 else
568 {
569 // 16-bit byte swaps use "ror reg.16, 8"
570 inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
571 }
572
573 genProduceReg(tree);
574}
575
// Generate code to get the high N bits of an N*N=2N bit multiplication result
577void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
578{
579 assert(!treeNode->gtOverflowEx());
580
581 regNumber targetReg = treeNode->gtRegNum;
582 var_types targetType = treeNode->TypeGet();
583 emitter* emit = getEmitter();
584 emitAttr size = emitTypeSize(treeNode);
585 GenTree* op1 = treeNode->gtOp.gtOp1;
586 GenTree* op2 = treeNode->gtOp.gtOp2;
587
    // To get the high bits of the multiply, we are constrained to using the
    // 1-op form: RDX:RAX = RAX * rm.
    // The 3-op form (Rx = Ry * Rz) does not produce the high bits.
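    // So the sequence is roughly (a sketch):
    //   mov  rax, regOp          ; only if regOp is not already RAX
    //   imul/mul rmOp            ; RDX:RAX = RAX * rmOp
    //   mov  targetReg, rdx      ; GT_MULHI, only if targetReg != RDX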
591
592 genConsumeOperands(treeNode->AsOp());
593
594 GenTree* regOp = op1;
595 GenTree* rmOp = op2;
596
597 // Set rmOp to the memory operand (if any)
598 if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == REG_RAX)))
599 {
600 regOp = op2;
601 rmOp = op1;
602 }
603 assert(regOp->isUsedFromReg());
604
    // Move the register operand into RAX when it is not already there.
606 if (regOp->gtRegNum != REG_RAX)
607 {
608 inst_RV_RV(ins_Copy(targetType), REG_RAX, regOp->gtRegNum, targetType);
609 }
610
611 instruction ins;
612 if ((treeNode->gtFlags & GTF_UNSIGNED) == 0)
613 {
614 ins = INS_imulEAX;
615 }
616 else
617 {
618 ins = INS_mulEAX;
619 }
620 emit->emitInsBinary(ins, size, treeNode, rmOp);
621
622 // Move the result to the desired register, if necessary
623 if (treeNode->OperGet() == GT_MULHI && targetReg != REG_RDX)
624 {
625 inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
626 }
627
628 genProduceReg(treeNode);
629}
630
631#ifdef _TARGET_X86_
632//------------------------------------------------------------------------
633// genCodeForLongUMod: Generate code for a tree of the form
634// `(umod (gt_long x y) (const int))`
635//
636// Arguments:
637// node - the node for which to generate code
638//
639void CodeGen::genCodeForLongUMod(GenTreeOp* node)
640{
641 assert(node != nullptr);
642 assert(node->OperGet() == GT_UMOD);
643 assert(node->TypeGet() == TYP_INT);
644
645 GenTreeOp* const dividend = node->gtOp1->AsOp();
646 assert(dividend->OperGet() == GT_LONG);
647 assert(varTypeIsLong(dividend));
648
649 genConsumeOperands(node);
650
651 GenTree* const dividendLo = dividend->gtOp1;
652 GenTree* const dividendHi = dividend->gtOp2;
653 assert(dividendLo->isUsedFromReg());
654 assert(dividendHi->isUsedFromReg());
655
656 GenTree* const divisor = node->gtOp2;
657 assert(divisor->gtSkipReloadOrCopy()->OperGet() == GT_CNS_INT);
658 assert(divisor->gtSkipReloadOrCopy()->isUsedFromReg());
659 assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal >= 2);
660 assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal <= 0x3fffffff);
661
    // dividendLo must be in EAX; dividendHi must be in EDX
663 genCopyRegIfNeeded(dividendLo, REG_EAX);
664 genCopyRegIfNeeded(dividendHi, REG_EDX);
665
    // At this point, EDX:EAX contains the 64-bit dividend and op2->gtRegNum
    // contains the 32-bit divisor. We want to generate the following code:
668 //
669 // cmp edx, divisor->gtRegNum
670 // jb noOverflow
671 //
672 // mov temp, eax
673 // mov eax, edx
674 // xor edx, edx
675 // div divisor->gtRegNum
676 // mov eax, temp
677 //
678 // noOverflow:
679 // div divisor->gtRegNum
680 //
681 // This works because (a * 2^32 + b) % c = ((a % c) * 2^32 + b) % c.
682
683 BasicBlock* const noOverflow = genCreateTempLabel();
684
685 // cmp edx, divisor->gtRegNum
686 // jb noOverflow
687 inst_RV_RV(INS_cmp, REG_EDX, divisor->gtRegNum);
688 inst_JMP(EJ_jb, noOverflow);
689
690 // mov temp, eax
691 // mov eax, edx
692 // xor edx, edx
693 // div divisor->gtRegNum
694 // mov eax, temp
695 const regNumber tempReg = node->GetSingleTempReg();
696 inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT);
697 inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT);
698 instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
699 inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
700 inst_RV_RV(INS_mov, REG_EAX, tempReg, TYP_INT);
701
702 // noOverflow:
703 // div divisor->gtRegNum
704 genDefineTempLabel(noOverflow);
705 inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
706
707 const regNumber targetReg = node->gtRegNum;
708 if (targetReg != REG_EDX)
709 {
710 inst_RV_RV(INS_mov, targetReg, REG_RDX, TYP_INT);
711 }
712 genProduceReg(node);
713}
714#endif // _TARGET_X86_
715
716//------------------------------------------------------------------------
717// genCodeForDivMod: Generate code for a DIV or MOD operation.
718//
719// Arguments:
720// treeNode - the node to generate the code for
721//
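// The generated sequence is roughly (a sketch; 32-bit or 64-bit forms depending on the type):
//   mov  rax, dividendReg       ; only if needed
//   cdq / xor edx, edx          ; sign- or zero-extend the dividend into the high half
//   idiv/div divisorOp
//   mov  targetReg, rax/rdx     ; quotient for DIV/UDIV, remainder for MOD/UMOD, if needed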
722void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
723{
724 assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD));
725
726 GenTree* dividend = treeNode->gtOp1;
727
728#ifdef _TARGET_X86_
729 if (varTypeIsLong(dividend->TypeGet()))
730 {
731 genCodeForLongUMod(treeNode);
732 return;
733 }
734#endif // _TARGET_X86_
735
736 GenTree* divisor = treeNode->gtOp2;
737 genTreeOps oper = treeNode->OperGet();
738 emitAttr size = emitTypeSize(treeNode);
739 regNumber targetReg = treeNode->gtRegNum;
740 var_types targetType = treeNode->TypeGet();
741 emitter* emit = getEmitter();
742
    // The node's type must be int/native int; small integer types are not
    // supported, and floating point types are handled by genCodeForBinary.
745 assert(varTypeIsIntOrI(targetType));
746 // dividend is in a register.
747 assert(dividend->isUsedFromReg());
748
749 genConsumeOperands(treeNode->AsOp());
750 // dividend must be in RAX
751 genCopyRegIfNeeded(dividend, REG_RAX);
752
753 // zero or sign extend rax to rdx
754 if (oper == GT_UMOD || oper == GT_UDIV)
755 {
756 instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
757 }
758 else
759 {
760 emit->emitIns(INS_cdq, size);
        // The cdq instruction writes RDX, so clear the gcInfo for RDX.
762 gcInfo.gcMarkRegSetNpt(RBM_RDX);
763 }
764
765 // Perform the 'targetType' (64-bit or 32-bit) divide instruction
766 instruction ins;
767 if (oper == GT_UMOD || oper == GT_UDIV)
768 {
769 ins = INS_div;
770 }
771 else
772 {
773 ins = INS_idiv;
774 }
775
776 emit->emitInsBinary(ins, size, treeNode, divisor);
777
778 // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX.
779 // Move the result to the desired register, if necessary
780 if (oper == GT_DIV || oper == GT_UDIV)
781 {
782 if (targetReg != REG_RAX)
783 {
784 inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
785 }
786 }
787 else
788 {
789 assert((oper == GT_MOD) || (oper == GT_UMOD));
790 if (targetReg != REG_RDX)
791 {
792 inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
793 }
794 }
795 genProduceReg(treeNode);
796}
797
798//------------------------------------------------------------------------
799// genCodeForBinary: Generate code for many binary arithmetic operators
800//
801// Arguments:
802// treeNode - The binary operation for which we are generating code.
803//
804// Return Value:
805// None.
806//
807// Notes:
808// Integer MUL and DIV variants have special constraints on x64 so are not handled here.
809// See the assert below for the operators that are handled.
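// For example (a sketch), an integer 'reg3 = reg1 + reg2' with three distinct registers
// typically becomes either
//   mov reg3, reg1
//   add reg3, reg2
// or, when the flags result is not needed, a single
//   lea reg3, [reg1 + reg2]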
810
811void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
812{
813#ifdef DEBUG
814 bool isValidOper = treeNode->OperIs(GT_ADD, GT_SUB);
815 if (varTypeIsFloating(treeNode->TypeGet()))
816 {
817 isValidOper |= treeNode->OperIs(GT_MUL, GT_DIV);
818 }
819 else
820 {
821 isValidOper |= treeNode->OperIs(GT_AND, GT_OR, GT_XOR);
822#ifndef _TARGET_64BIT_
823 isValidOper |= treeNode->OperIs(GT_ADD_LO, GT_ADD_HI, GT_SUB_LO, GT_SUB_HI);
824#endif
825 }
826 assert(isValidOper);
827#endif
828
829 genConsumeOperands(treeNode);
830
831 const genTreeOps oper = treeNode->OperGet();
832 regNumber targetReg = treeNode->gtRegNum;
833 var_types targetType = treeNode->TypeGet();
834 emitter* emit = getEmitter();
835
836 GenTree* op1 = treeNode->gtGetOp1();
837 GenTree* op2 = treeNode->gtGetOp2();
838
839 // Commutative operations can mark op1 as contained or reg-optional to generate "op reg, memop/immed"
840 if (!op1->isUsedFromReg())
841 {
842 assert(treeNode->OperIsCommutative());
843 assert(op1->isMemoryOp() || op1->IsLocal() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() ||
844 op1->IsRegOptional());
845
846 op1 = treeNode->gtGetOp2();
847 op2 = treeNode->gtGetOp1();
848 }
849
850 instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
851
852 // The arithmetic node must be sitting in a register (since it's not contained)
853 noway_assert(targetReg != REG_NA);
854
855 regNumber op1reg = op1->isUsedFromReg() ? op1->gtRegNum : REG_NA;
856 regNumber op2reg = op2->isUsedFromReg() ? op2->gtRegNum : REG_NA;
857
858 GenTree* dst;
859 GenTree* src;
860
861 // This is the case of reg1 = reg1 op reg2
862 // We're ready to emit the instruction without any moves
863 if (op1reg == targetReg)
864 {
865 dst = op1;
866 src = op2;
867 }
    // We have reg1 = reg2 op reg1.
    // For this operation to be correct, op must be commutative, so that
    // we can convert it into reg1 = reg1 op reg2 and emit the same code
    // as above.
873 else if (op2reg == targetReg)
874 {
875 noway_assert(GenTree::OperIsCommutative(oper));
876 dst = op2;
877 src = op1;
878 }
879 // now we know there are 3 different operands so attempt to use LEA
880 else if (oper == GT_ADD && !varTypeIsFloating(treeNode) && !treeNode->gtOverflowEx() // LEA does not set flags
881 && (op2->isContainedIntOrIImmed() || op2->isUsedFromReg()) && !treeNode->gtSetFlags())
882 {
883 if (op2->isContainedIntOrIImmed())
884 {
885 emit->emitIns_R_AR(INS_lea, emitTypeSize(treeNode), targetReg, op1reg,
886 (int)op2->AsIntConCommon()->IconValue());
887 }
888 else
889 {
890 assert(op2reg != REG_NA);
891 emit->emitIns_R_ARX(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, op2reg, 1, 0);
892 }
893 genProduceReg(treeNode);
894 return;
895 }
896 // dest, op1 and op2 registers are different:
897 // reg3 = reg1 op reg2
898 // We can implement this by issuing a mov:
899 // reg3 = reg1
900 // reg3 = reg3 op reg2
901 else
902 {
903 inst_RV_RV(ins_Copy(targetType), targetReg, op1reg, targetType);
904 regSet.verifyRegUsed(targetReg);
905 gcInfo.gcMarkRegPtrVal(targetReg, targetType);
906 dst = treeNode;
907 src = op2;
908 }
909
910 // try to use an inc or dec
911 if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
912 {
913 if (src->IsIntegralConst(1))
914 {
915 emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
916 genProduceReg(treeNode);
917 return;
918 }
919 else if (src->IsIntegralConst(-1))
920 {
921 emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
922 genProduceReg(treeNode);
923 return;
924 }
925 }
926 regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
927 noway_assert(r == targetReg);
928
929 if (treeNode->gtOverflowEx())
930 {
931#if !defined(_TARGET_64BIT_)
932 assert(oper == GT_ADD || oper == GT_SUB || oper == GT_ADD_HI || oper == GT_SUB_HI);
933#else
934 assert(oper == GT_ADD || oper == GT_SUB);
935#endif
936 genCheckOverflow(treeNode);
937 }
938 genProduceReg(treeNode);
939}
940
941//------------------------------------------------------------------------
942// genCodeForMul: Generate code for a MUL operation.
943//
944// Arguments:
945// treeNode - the node to generate the code for
946//
947void CodeGen::genCodeForMul(GenTreeOp* treeNode)
948{
949 assert(treeNode->OperIs(GT_MUL));
950
951 regNumber targetReg = treeNode->gtRegNum;
952 var_types targetType = treeNode->TypeGet();
953 emitter* emit = getEmitter();
954
    // The node's type must be int or long (the latter only on x64); small integer types
    // are not supported, and floating point types are handled by genCodeForBinary.
957 assert(varTypeIsIntOrI(targetType));
958
959 instruction ins;
960 emitAttr size = emitTypeSize(treeNode);
961 bool isUnsignedMultiply = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
962 bool requiresOverflowCheck = treeNode->gtOverflowEx();
963
964 GenTree* op1 = treeNode->gtGetOp1();
965 GenTree* op2 = treeNode->gtGetOp2();
966
967 // there are 3 forms of x64 multiply:
968 // 1-op form with 128 result: RDX:RAX = RAX * rm
969 // 2-op form: reg *= rm
970 // 3-op form: reg = rm * imm
971
972 genConsumeOperands(treeNode);
973
974 // This matches the 'mul' lowering in Lowering::SetMulOpCounts()
975 //
976 // immOp :: Only one operand can be an immediate
977 // rmOp :: Only one operand can be a memory op.
978 // regOp :: A register op (especially the operand that matches 'targetReg')
979 // (can be nullptr when we have both a memory op and an immediate op)
980
981 GenTree* immOp = nullptr;
982 GenTree* rmOp = op1;
983 GenTree* regOp;
984
985 if (op2->isContainedIntOrIImmed())
986 {
987 immOp = op2;
988 }
989 else if (op1->isContainedIntOrIImmed())
990 {
991 immOp = op1;
992 rmOp = op2;
993 }
994
995 if (immOp != nullptr)
996 {
997 // CQ: When possible use LEA for mul by imm 3, 5 or 9
998 ssize_t imm = immOp->AsIntConCommon()->IconValue();
999
1000 if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
1001 {
1002 // We will use the LEA instruction to perform this multiply
1003 // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
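            // For example (a sketch), imm == 5 gives scale == 4:
            //   lea targetReg, [srcReg + 4*srcReg]   ; targetReg = srcReg * 5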
1004 unsigned int scale = (unsigned int)(imm - 1);
1005 getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
1006 }
1007 else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0))
1008 {
1009 // Use shift for constant multiply when legal
1010 uint64_t zextImm = static_cast<uint64_t>(static_cast<size_t>(imm));
1011 unsigned int shiftAmount = genLog2(zextImm);
1012
1013 if (targetReg != rmOp->gtRegNum)
1014 {
1015 // Copy reg src to dest register
1016 inst_RV_RV(INS_mov, targetReg, rmOp->gtRegNum, targetType);
1017 }
1018 inst_RV_SH(INS_shl, size, targetReg, shiftAmount);
1019 }
1020 else
1021 {
1022 // use the 3-op form with immediate
1023 ins = getEmitter()->inst3opImulForReg(targetReg);
1024 emit->emitInsBinary(ins, size, rmOp, immOp);
1025 }
1026 }
1027 else // we have no contained immediate operand
1028 {
1029 regOp = op1;
1030 rmOp = op2;
1031
1032 regNumber mulTargetReg = targetReg;
1033 if (isUnsignedMultiply && requiresOverflowCheck)
1034 {
1035 ins = INS_mulEAX;
1036 mulTargetReg = REG_RAX;
1037 }
1038 else
1039 {
1040 ins = INS_imul;
1041 }
1042
1043 // Set rmOp to the memory operand (if any)
1044 // or set regOp to the op2 when it has the matching target register for our multiply op
1045 //
1046 if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == mulTargetReg)))
1047 {
1048 regOp = op2;
1049 rmOp = op1;
1050 }
1051 assert(regOp->isUsedFromReg());
1052
        // Move the register operand into the multiply target register when it is not already there.
1054 if (regOp->gtRegNum != mulTargetReg)
1055 {
1056 inst_RV_RV(INS_mov, mulTargetReg, regOp->gtRegNum, targetType);
1057 }
1058
1059 emit->emitInsBinary(ins, size, treeNode, rmOp);
1060
1061 // Move the result to the desired register, if necessary
1062 if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
1063 {
1064 inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
1065 }
1066 }
1067
1068 if (requiresOverflowCheck)
1069 {
1070 // Overflow checking is only used for non-floating point types
1071 noway_assert(!varTypeIsFloating(treeNode));
1072
1073 genCheckOverflow(treeNode);
1074 }
1075
1076 genProduceReg(treeNode);
1077}
1078
1079//------------------------------------------------------------------------
1080// isStructReturn: Returns whether the 'treeNode' is returning a struct.
1081//
1082// Arguments:
1083// treeNode - The tree node to evaluate whether is a struct return.
1084//
1085// Return Value:
// For AMD64 *nix: returns true if 'treeNode' is a GT_RETURN node of struct type.
1087// Otherwise returns false.
1088// For other platforms always returns false.
1089//
1090bool CodeGen::isStructReturn(GenTree* treeNode)
1091{
    // This method can be called for a 'treeNode' of GT_RETFILT or GT_RETURN.
    // For GT_RETFILT, the return is always a bool or a void (for the end of a
    // finally block), so it is never a struct return.
1095 noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
1096 if (treeNode->OperGet() != GT_RETURN)
1097 {
1098 return false;
1099 }
1100
1101#ifdef UNIX_AMD64_ABI
1102 return varTypeIsStruct(treeNode);
1103#else // !UNIX_AMD64_ABI
1104 assert(!varTypeIsStruct(treeNode));
1105 return false;
1106#endif // UNIX_AMD64_ABI
1107}
1108
1109//------------------------------------------------------------------------
1110// genStructReturn: Generates code for returning a struct.
1111//
1112// Arguments:
1113// treeNode - The GT_RETURN tree node.
1114//
1115// Return Value:
1116// None
1117//
1118// Assumption:
1119// op1 of GT_RETURN node is either GT_LCL_VAR or multi-reg GT_CALL
1120void CodeGen::genStructReturn(GenTree* treeNode)
1121{
1122 assert(treeNode->OperGet() == GT_RETURN);
1123 GenTree* op1 = treeNode->gtGetOp1();
1124
1125#ifdef UNIX_AMD64_ABI
1126 if (op1->OperGet() == GT_LCL_VAR)
1127 {
1128 GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon();
1129 LclVarDsc* varDsc = &(compiler->lvaTable[lclVar->gtLclNum]);
1130 assert(varDsc->lvIsMultiRegRet);
1131
1132 ReturnTypeDesc retTypeDesc;
1133 retTypeDesc.InitializeStructReturnType(compiler, varDsc->lvVerTypeInfo.GetClassHandle());
1134 unsigned regCount = retTypeDesc.GetReturnRegCount();
1135 assert(regCount == MAX_RET_REG_COUNT);
1136
1137 if (varTypeIsEnregisterableStruct(op1))
1138 {
1139 // Right now the only enregistrable structs supported are SIMD vector types.
1140 assert(varTypeIsSIMD(op1));
1141 assert(op1->isUsedFromReg());
1142
            // This is the case where the operand is in a single reg and needs to be
            // returned in multiple ABI return registers.
1145 regNumber opReg = genConsumeReg(op1);
1146 regNumber reg0 = retTypeDesc.GetABIReturnReg(0);
1147 regNumber reg1 = retTypeDesc.GetABIReturnReg(1);
1148
1149 if (opReg != reg0 && opReg != reg1)
1150 {
                // Operand reg is different from the return regs.
                // Copy opReg to reg0 and let it be handled by one of the
                // two cases below.
1154 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
1155 opReg = reg0;
1156 }
1157
1158 if (opReg == reg0)
1159 {
1160 assert(opReg != reg1);
1161
1162 // reg0 - already has required 8-byte in bit position [63:0].
1163 // reg1 = opReg.
1164 // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
1165 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg1, opReg, TYP_DOUBLE);
1166 }
1167 else
1168 {
1169 assert(opReg == reg1);
1170
1171 // reg0 = opReg.
1172 // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
1173 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
1174 }
1175 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01);
1176 }
1177 else
1178 {
1179 assert(op1->isUsedFromMemory());
1180
1181 // Copy var on stack into ABI return registers
1182 int offset = 0;
1183 for (unsigned i = 0; i < regCount; ++i)
1184 {
1185 var_types type = retTypeDesc.GetReturnRegType(i);
1186 regNumber reg = retTypeDesc.GetABIReturnReg(i);
1187 getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), reg, lclVar->gtLclNum, offset);
1188 offset += genTypeSize(type);
1189 }
1190 }
1191 }
1192 else
1193 {
1194 assert(op1->IsMultiRegCall() || op1->IsCopyOrReloadOfMultiRegCall());
1195
1196 genConsumeRegs(op1);
1197
1198 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
1199 GenTreeCall* call = actualOp1->AsCall();
1200 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
1201 unsigned regCount = retTypeDesc->GetReturnRegCount();
1202 assert(regCount == MAX_RET_REG_COUNT);
1203
1204 // Handle circular dependency between call allocated regs and ABI return regs.
1205 //
        // It is possible under LSRA stress that the originally allocated regs of the call node,
        // say rax and rdx, are spilled and reloaded to rdx and rax respectively. But
        // GT_RETURN needs to move values as follows: rdx->rax, rax->rdx. A similar kind
        // of circular dependency could arise between the xmm0 and xmm1 return regs.
        // Codegen is expected to handle such circular dependencies.
1211 //
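        // For example (a sketch): if the values were reloaded such that allocatedReg0 == rdx and
        // allocatedReg1 == rax while returnReg0 == rax and returnReg1 == rdx, a single
        //   xchg rax, rdx
        // resolves the cycle (the xmm case below uses three pxor instructions instead).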
1212 var_types regType0 = retTypeDesc->GetReturnRegType(0);
1213 regNumber returnReg0 = retTypeDesc->GetABIReturnReg(0);
1214 regNumber allocatedReg0 = call->GetRegNumByIdx(0);
1215
1216 var_types regType1 = retTypeDesc->GetReturnRegType(1);
1217 regNumber returnReg1 = retTypeDesc->GetABIReturnReg(1);
1218 regNumber allocatedReg1 = call->GetRegNumByIdx(1);
1219
1220 if (op1->IsCopyOrReload())
1221 {
1222 // GT_COPY/GT_RELOAD will have valid reg for those positions
1223 // that need to be copied or reloaded.
1224 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
1225 if (reloadReg != REG_NA)
1226 {
1227 allocatedReg0 = reloadReg;
1228 }
1229
1230 reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
1231 if (reloadReg != REG_NA)
1232 {
1233 allocatedReg1 = reloadReg;
1234 }
1235 }
1236
1237 if (allocatedReg0 == returnReg1 && allocatedReg1 == returnReg0)
1238 {
1239 // Circular dependency - swap allocatedReg0 and allocatedReg1
1240 if (varTypeIsFloating(regType0))
1241 {
1242 assert(varTypeIsFloating(regType1));
1243
1244 // The fastest way to swap two XMM regs is using PXOR
1245 inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
1246 inst_RV_RV(INS_pxor, allocatedReg1, allocatedReg0, TYP_DOUBLE);
1247 inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
1248 }
1249 else
1250 {
1251 assert(varTypeIsIntegral(regType0));
1252 assert(varTypeIsIntegral(regType1));
1253 inst_RV_RV(INS_xchg, allocatedReg1, allocatedReg0, TYP_I_IMPL);
1254 }
1255 }
1256 else if (allocatedReg1 == returnReg0)
1257 {
1258 // Change the order of moves to correctly handle dependency.
1259 if (allocatedReg1 != returnReg1)
1260 {
1261 inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
1262 }
1263
1264 if (allocatedReg0 != returnReg0)
1265 {
1266 inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
1267 }
1268 }
1269 else
1270 {
1271 // No circular dependency case.
1272 if (allocatedReg0 != returnReg0)
1273 {
1274 inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
1275 }
1276
1277 if (allocatedReg1 != returnReg1)
1278 {
1279 inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
1280 }
1281 }
1282 }
1283#else
1284 unreached();
1285#endif
1286}
1287
1288#if defined(_TARGET_X86_)
1289
1290//------------------------------------------------------------------------
1291// genFloatReturn: Generates code for float return statement for x86.
1292//
1293// Note: treeNode's and op1's registers are already consumed.
1294//
1295// Arguments:
1296// treeNode - The GT_RETURN or GT_RETFILT tree node with float type.
1297//
1298// Return Value:
1299// None
1300//
1301void CodeGen::genFloatReturn(GenTree* treeNode)
1302{
1303 assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
1304 assert(varTypeIsFloating(treeNode));
1305
1306 GenTree* op1 = treeNode->gtGetOp1();
1307 // Spill the return value register from an XMM register to the stack, then load it on the x87 stack.
1308 // If it already has a home location, use that. Otherwise, we need a temp.
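    // Roughly (a sketch, for a double return whose value lives in an XMM register):
    //   movsd qword ptr [frame slot], xmmN
    //   fld   qword ptr [frame slot]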
1309 if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvOnFrame)
1310 {
1311 if (compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegNum != REG_STK)
1312 {
1313 op1->gtFlags |= GTF_SPILL;
1314 inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1,
1315 op1->gtRegNum);
1316 }
1317 // Now, load it to the fp stack.
1318 getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
1319 }
1320 else
1321 {
1322 // Spill the value, which should be in a register, then load it to the fp stack.
1323 // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
1324 op1->gtFlags |= GTF_SPILL;
1325 regSet.rsSpillTree(op1->gtRegNum, op1);
1326 op1->gtFlags |= GTF_SPILLED;
1327 op1->gtFlags &= ~GTF_SPILL;
1328
1329 TempDsc* t = regSet.rsUnspillInPlace(op1, op1->gtRegNum);
1330 inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
1331 op1->gtFlags &= ~GTF_SPILLED;
1332 regSet.tmpRlsTemp(t);
1333 }
1334}
1335#endif // _TARGET_X86_
1336
1337//------------------------------------------------------------------------
1338// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node.
1339//
1340// Arguments:
1341// tree - the node
1342//
1343void CodeGen::genCodeForCompare(GenTreeOp* tree)
1344{
1345 assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP));
1346
1347 // TODO-XArch-CQ: Check if we can use the currently set flags.
1348 // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
1349 // (signed < or >= where targetReg != REG_NA)
1350
1351 GenTree* op1 = tree->gtOp1;
1352 var_types op1Type = op1->TypeGet();
1353
1354 if (varTypeIsFloating(op1Type))
1355 {
1356 genCompareFloat(tree);
1357 }
1358 else
1359 {
1360 genCompareInt(tree);
1361 }
1362}
1363
1364//------------------------------------------------------------------------
1365// genCodeForBT: Generates code for a GT_BT node.
1366//
1367// Arguments:
1368// tree - The node.
1369//
1370void CodeGen::genCodeForBT(GenTreeOp* bt)
1371{
1372 assert(bt->OperIs(GT_BT));
1373
1374 GenTree* op1 = bt->gtGetOp1();
1375 GenTree* op2 = bt->gtGetOp2();
1376 var_types type = genActualType(op1->TypeGet());
1377
1378 assert(op1->isUsedFromReg() && op2->isUsedFromReg());
1379 assert((genTypeSize(type) >= genTypeSize(TYP_INT)) && (genTypeSize(type) <= genTypeSize(TYP_I_IMPL)));
1380
1381 genConsumeOperands(bt);
    // Note that the emitter doesn't fully support INS_bt; it only supports the reg,reg
    // form and encodes the registers in reverse order. To get the correct order, we need
    // to reverse the operands when calling emitIns_R_R.
1385 getEmitter()->emitIns_R_R(INS_bt, emitTypeSize(type), op2->gtRegNum, op1->gtRegNum);
1386}
1387
1388//------------------------------------------------------------------------
1389// genCodeForJumpTrue: Generates code for jmpTrue statement.
1390//
1391// Arguments:
1392// tree - The GT_JTRUE tree node.
1393//
1394// Return Value:
1395// None
1396//
1397void CodeGen::genCodeForJumpTrue(GenTree* tree)
1398{
1399 GenTree* cmp = tree->gtOp.gtOp1;
1400
1401 assert(cmp->OperIsCompare());
1402 assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
1403
1404#if !defined(_TARGET_64BIT_)
1405 // Long-typed compares should have been handled by Lowering::LowerCompare.
1406 assert(!varTypeIsLong(cmp->gtGetOp1()));
1407#endif
1408
1409 // Get the "kind" and type of the comparison. Note that whether it is an unsigned cmp
1410 // is governed by a flag NOT by the inherent type of the node
1411 // TODO-XArch-CQ: Check if we can use the currently set flags.
1412 emitJumpKind jumpKind[2];
1413 bool branchToTrueLabel[2];
1414 genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
1415
1416 BasicBlock* skipLabel = nullptr;
1417 if (jumpKind[0] != EJ_NONE)
1418 {
1419 BasicBlock* jmpTarget;
1420 if (branchToTrueLabel[0])
1421 {
1422 jmpTarget = compiler->compCurBB->bbJumpDest;
1423 }
1424 else
1425 {
1426 // This case arises only for ordered GT_EQ right now
1427 assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
1428 skipLabel = genCreateTempLabel();
1429 jmpTarget = skipLabel;
1430 }
1431
1432 inst_JMP(jumpKind[0], jmpTarget);
1433 }
1434
1435 if (jumpKind[1] != EJ_NONE)
1436 {
1437 // the second conditional branch always has to be to the true label
1438 assert(branchToTrueLabel[1]);
1439 inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
1440 }
1441
1442 if (skipLabel != nullptr)
1443 {
1444 genDefineTempLabel(skipLabel);
1445 }
1446}
1447
1448//------------------------------------------------------------------------
1449// genCodeForJcc: Produce code for a GT_JCC node.
1450//
1451// Arguments:
1452// tree - the node
1453//
1454void CodeGen::genCodeForJcc(GenTreeCC* tree)
1455{
1456 assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
1457
1458 CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
1459 emitJumpKind jumpKind = genJumpKindForOper(tree->gtCondition, compareKind);
1460
1461 inst_JMP(jumpKind, compiler->compCurBB->bbJumpDest);
1462}
1463
1464//------------------------------------------------------------------------
1465// genCodeForSetcc: Generates a setcc instruction for a GT_SETCC node.
1466//
1467// Arguments:
1468// tree - the GT_SETCC node
1469//
1470// Assumptions:
1471// The condition represents an integer comparison. This code doesn't
// have the necessary logic to deal with floating point comparisons; in fact, it
// doesn't even know whether the comparison is integer or floating point, because
// SETCC nodes do not have any operands.
1475//
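// The emitted sequence is roughly (a sketch):
//   setcc dstRegByteForm
//   movzx dstReg, dstRegByteForm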
1476
1477void CodeGen::genCodeForSetcc(GenTreeCC* setcc)
1478{
1479 regNumber dstReg = setcc->gtRegNum;
1480 CompareKind compareKind = setcc->IsUnsigned() ? CK_UNSIGNED : CK_SIGNED;
1481 emitJumpKind jumpKind = genJumpKindForOper(setcc->gtCondition, compareKind);
1482
1483 assert(genIsValidIntReg(dstReg) && isByteReg(dstReg));
1484 // Make sure nobody is setting GTF_RELOP_NAN_UN on this node as it is ignored.
1485 assert((setcc->gtFlags & GTF_RELOP_NAN_UN) == 0);
1486
1487 inst_SET(jumpKind, dstReg);
1488 inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
1489 genProduceReg(setcc);
1490}
1491
1492//------------------------------------------------------------------------
1493// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node.
1494//
1495// Arguments:
1496// tree - the GT_RETURNTRAP node
1497//
1498void CodeGen::genCodeForReturnTrap(GenTreeOp* tree)
1499{
1500 assert(tree->OperGet() == GT_RETURNTRAP);
1501
1502 // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
1503 // based on the contents of 'data'
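    // Roughly (a sketch):
    //   cmp  dword ptr [data], 0     ; or a reg form, depending on where 'data' lives
    //   je   skip
    //   call CORINFO_HELP_STOP_FOR_GC
    // skip: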
1504
1505 GenTree* data = tree->gtOp1;
1506 genConsumeRegs(data);
1507 GenTreeIntCon cns = intForm(TYP_INT, 0);
1508 cns.SetContained();
1509 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
1510
1511 BasicBlock* skipLabel = genCreateTempLabel();
1512
1513 emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
1514 inst_JMP(jmpEqual, skipLabel);
1515
1516 // emit the call to the EE-helper that stops for GC (or other reasons)
1517 regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
1518 assert(genIsValidIntReg(tmpReg));
1519
1520 genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
1521 genDefineTempLabel(skipLabel);
1522}
1523
1524/*****************************************************************************
1525 *
1526 * Generate code for a single node in the tree.
1527 * Preconditions: All operands have been evaluated
1528 *
1529 */
1530void CodeGen::genCodeForTreeNode(GenTree* treeNode)
1531{
1532 regNumber targetReg;
1533#if !defined(_TARGET_64BIT_)
1534 if (treeNode->TypeGet() == TYP_LONG)
1535 {
1536 // All long enregistered nodes will have been decomposed into their
1537 // constituent lo and hi nodes.
1538 targetReg = REG_NA;
1539 }
1540 else
1541#endif // !defined(_TARGET_64BIT_)
1542 {
1543 targetReg = treeNode->gtRegNum;
1544 }
1545 var_types targetType = treeNode->TypeGet();
1546 emitter* emit = getEmitter();
1547
1548#ifdef DEBUG
1549 // Validate that all the operands for the current node are consumed in order.
1550 // This is important because LSRA ensures that any necessary copies will be
1551 // handled correctly.
1552 lastConsumedNode = nullptr;
1553 if (compiler->verbose)
1554 {
1555 unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio
1556 compiler->gtDispLIRNode(treeNode, "Generating: ");
1557 }
1558#endif // DEBUG
1559
1560 // Is this a node whose value is already in a register? LSRA denotes this by
1561 // setting the GTF_REUSE_REG_VAL flag.
1562 if (treeNode->IsReuseRegVal())
1563 {
1564 // For now, this is only used for constant nodes.
1565 assert((treeNode->OperIsConst()));
1566 JITDUMP(" TreeNode is marked ReuseReg\n");
1567 return;
1568 }
1569
1570 // contained nodes are part of their parents for codegen purposes
    // e.g. immediates, most LEAs
1572 if (treeNode->isContained())
1573 {
1574 return;
1575 }
1576
1577 switch (treeNode->gtOper)
1578 {
1579#ifndef JIT32_GCENCODER
1580 case GT_START_NONGC:
1581 getEmitter()->emitDisableGC();
1582 break;
1583#endif // !defined(JIT32_GCENCODER)
1584
1585 case GT_PROF_HOOK:
1586#ifdef PROFILING_SUPPORTED
1587 // We should be seeing this only if profiler hook is needed
1588 noway_assert(compiler->compIsProfilerHookNeeded());
1589
1590 // Right now this node is used only for tail calls. In future if
1591 // we intend to use it for Enter or Leave hooks, add a data member
1592 // to this node indicating the kind of profiler hook. For example,
1593 // helper number can be used.
1594 genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
1595#endif // PROFILING_SUPPORTED
1596 break;
1597
1598 case GT_LCLHEAP:
1599 genLclHeap(treeNode);
1600 break;
1601
1602 case GT_CNS_INT:
1603#ifdef _TARGET_X86_
1604 assert(!treeNode->IsIconHandle(GTF_ICON_TLS_HDL));
1605#endif // _TARGET_X86_
1606 __fallthrough;
1607
1608 case GT_CNS_DBL:
1609 genSetRegToConst(targetReg, targetType, treeNode);
1610 genProduceReg(treeNode);
1611 break;
1612
1613 case GT_NOT:
1614 case GT_NEG:
1615 genCodeForNegNot(treeNode);
1616 break;
1617
1618 case GT_BSWAP:
1619 case GT_BSWAP16:
1620 genCodeForBswap(treeNode);
1621 break;
1622
1623 case GT_DIV:
1624 if (varTypeIsFloating(treeNode->TypeGet()))
1625 {
1626 genCodeForBinary(treeNode->AsOp());
1627 break;
1628 }
1629 __fallthrough;
1630 case GT_MOD:
1631 case GT_UMOD:
1632 case GT_UDIV:
1633 genCodeForDivMod(treeNode->AsOp());
1634 break;
1635
1636 case GT_OR:
1637 case GT_XOR:
1638 case GT_AND:
1639 assert(varTypeIsIntegralOrI(treeNode));
1640
1641 __fallthrough;
1642
1643#if !defined(_TARGET_64BIT_)
1644 case GT_ADD_LO:
1645 case GT_ADD_HI:
1646 case GT_SUB_LO:
1647 case GT_SUB_HI:
1648#endif // !defined(_TARGET_64BIT_)
1649
1650 case GT_ADD:
1651 case GT_SUB:
1652 genCodeForBinary(treeNode->AsOp());
1653 break;
1654
1655 case GT_MUL:
1656 if (varTypeIsFloating(treeNode->TypeGet()))
1657 {
1658 genCodeForBinary(treeNode->AsOp());
1659 break;
1660 }
1661 genCodeForMul(treeNode->AsOp());
1662 break;
1663
1664 case GT_LSH:
1665 case GT_RSH:
1666 case GT_RSZ:
1667 case GT_ROL:
1668 case GT_ROR:
1669 genCodeForShift(treeNode);
1670 break;
1671
1672#if !defined(_TARGET_64BIT_)
1673
1674 case GT_LSH_HI:
1675 case GT_RSH_LO:
1676 genCodeForShiftLong(treeNode);
1677 break;
1678
1679#endif // !defined(_TARGET_64BIT_)
1680
1681 case GT_CAST:
1682 genCodeForCast(treeNode->AsOp());
1683 break;
1684
1685 case GT_BITCAST:
1686 {
1687 GenTree* const op1 = treeNode->AsOp()->gtOp1;
1688 genConsumeReg(op1);
1689
1690 const bool srcFltReg = varTypeIsFloating(op1) || varTypeIsSIMD(op1);
1691 const bool dstFltReg = varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode);
1692 if (srcFltReg != dstFltReg)
1693 {
1694 instruction ins;
1695 regNumber fltReg;
1696 regNumber intReg;
1697 if (dstFltReg)
1698 {
1699 ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
1700 fltReg = treeNode->gtRegNum;
1701 intReg = op1->gtRegNum;
1702 }
1703 else
1704 {
1705 ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
1706 intReg = treeNode->gtRegNum;
1707 fltReg = op1->gtRegNum;
1708 }
1709 inst_RV_RV(ins, fltReg, intReg, treeNode->TypeGet());
1710 }
1711 else if (treeNode->gtRegNum != op1->gtRegNum)
1712 {
1713 inst_RV_RV(ins_Copy(treeNode->TypeGet()), treeNode->gtRegNum, op1->gtRegNum, treeNode->TypeGet());
1714 }
1715
1716 genProduceReg(treeNode);
1717 break;
1718 }
1719
1720 case GT_LCL_FLD_ADDR:
1721 case GT_LCL_VAR_ADDR:
1722 genCodeForLclAddr(treeNode);
1723 break;
1724
1725 case GT_LCL_FLD:
1726 genCodeForLclFld(treeNode->AsLclFld());
1727 break;
1728
1729 case GT_LCL_VAR:
1730 genCodeForLclVar(treeNode->AsLclVar());
1731 break;
1732
1733 case GT_STORE_LCL_FLD:
1734 genCodeForStoreLclFld(treeNode->AsLclFld());
1735 break;
1736
1737 case GT_STORE_LCL_VAR:
1738 genCodeForStoreLclVar(treeNode->AsLclVar());
1739 break;
1740
1741 case GT_RETFILT:
1742 case GT_RETURN:
1743 genReturn(treeNode);
1744 break;
1745
1746 case GT_LEA:
1747 // If we are here, it is the case where there is an LEA that cannot be folded into a parent instruction.
1748 genLeaInstruction(treeNode->AsAddrMode());
1749 break;
1750
1751 case GT_INDEX_ADDR:
1752 genCodeForIndexAddr(treeNode->AsIndexAddr());
1753 break;
1754
1755 case GT_IND:
1756 genCodeForIndir(treeNode->AsIndir());
1757 break;
1758
1759 case GT_MULHI:
1760#ifdef _TARGET_X86_
1761 case GT_MUL_LONG:
1762#endif
1763 genCodeForMulHi(treeNode->AsOp());
1764 break;
1765
1766 case GT_INTRINSIC:
1767 genIntrinsic(treeNode);
1768 break;
1769
1770#ifdef FEATURE_SIMD
1771 case GT_SIMD:
1772 genSIMDIntrinsic(treeNode->AsSIMD());
1773 break;
1774#endif // FEATURE_SIMD
1775
1776#ifdef FEATURE_HW_INTRINSICS
1777 case GT_HWIntrinsic:
1778 genHWIntrinsic(treeNode->AsHWIntrinsic());
1779 break;
1780#endif // FEATURE_HW_INTRINSICS
1781
1782 case GT_CKFINITE:
1783 genCkfinite(treeNode);
1784 break;
1785
1786 case GT_EQ:
1787 case GT_NE:
1788 case GT_LT:
1789 case GT_LE:
1790 case GT_GE:
1791 case GT_GT:
1792 case GT_TEST_EQ:
1793 case GT_TEST_NE:
1794 case GT_CMP:
1795 genCodeForCompare(treeNode->AsOp());
1796 break;
1797
1798 case GT_JTRUE:
1799 genCodeForJumpTrue(treeNode);
1800 break;
1801
1802 case GT_JCC:
1803 genCodeForJcc(treeNode->AsCC());
1804 break;
1805
1806 case GT_SETCC:
1807 genCodeForSetcc(treeNode->AsCC());
1808 break;
1809
1810 case GT_BT:
1811 genCodeForBT(treeNode->AsOp());
1812 break;
1813
1814 case GT_RETURNTRAP:
1815 genCodeForReturnTrap(treeNode->AsOp());
1816 break;
1817
1818 case GT_STOREIND:
1819 genCodeForStoreInd(treeNode->AsStoreInd());
1820 break;
1821
1822 case GT_COPY:
1823 // This is handled at the time we call genConsumeReg() on the GT_COPY
1824 break;
1825
1826 case GT_LIST:
1827 case GT_FIELD_LIST:
1828 // Should always be marked contained.
1829 assert(!"LIST, FIELD_LIST nodes should always be marked contained.");
1830 break;
1831
1832 case GT_SWAP:
1833 genCodeForSwap(treeNode->AsOp());
1834 break;
1835
1836 case GT_PUTARG_STK:
1837 genPutArgStk(treeNode->AsPutArgStk());
1838 break;
1839
1840 case GT_PUTARG_REG:
1841 genPutArgReg(treeNode->AsOp());
1842 break;
1843
1844 case GT_CALL:
1845 genCallInstruction(treeNode->AsCall());
1846 break;
1847
1848 case GT_JMP:
1849 genJmpMethod(treeNode);
1850 break;
1851
1852 case GT_LOCKADD:
1853 genCodeForLockAdd(treeNode->AsOp());
1854 break;
1855
1856 case GT_XCHG:
1857 case GT_XADD:
1858 genLockedInstructions(treeNode->AsOp());
1859 break;
1860
1861 case GT_MEMORYBARRIER:
1862 instGen_MemoryBarrier();
1863 break;
1864
1865 case GT_CMPXCHG:
1866 genCodeForCmpXchg(treeNode->AsCmpXchg());
1867 break;
1868
1869 case GT_RELOAD:
1870 // do nothing - reload is just a marker.
1871 // The parent node will call genConsumeReg on this which will trigger the unspill of this node's child
1872 // into the register specified in this node.
1873 break;
1874
1875 case GT_NOP:
1876 break;
1877
1878 case GT_NO_OP:
1879 getEmitter()->emitIns_Nop(1);
1880 break;
1881
1882 case GT_ARR_BOUNDS_CHECK:
1883#ifdef FEATURE_SIMD
1884 case GT_SIMD_CHK:
1885#endif // FEATURE_SIMD
1886#ifdef FEATURE_HW_INTRINSICS
1887 case GT_HW_INTRINSIC_CHK:
1888#endif // FEATURE_HW_INTRINSICS
1889 genRangeCheck(treeNode);
1890 break;
1891
1892 case GT_PHYSREG:
1893 genCodeForPhysReg(treeNode->AsPhysReg());
1894 break;
1895
1896 case GT_NULLCHECK:
1897 genCodeForNullCheck(treeNode->AsOp());
1898 break;
1899
1900 case GT_CATCH_ARG:
1901
1902 noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp));
1903
            /* Catch arguments get passed in a register. genCodeForBBlist()
               would have marked the register as holding a GC object, but the
               value is not otherwise used. */
1906
1907 noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT);
1908 genConsumeReg(treeNode);
1909 break;
1910
1911#if !FEATURE_EH_FUNCLETS
1912 case GT_END_LFIN:
1913
1914 // Have to clear the ShadowSP of the nesting level which encloses the finally. Generates:
1915 // mov dword ptr [ebp-0xC], 0 // for some slot of the ShadowSP local var
1916
1917 unsigned finallyNesting;
1918 finallyNesting = treeNode->gtVal.gtVal1;
1919 noway_assert(treeNode->gtVal.gtVal1 < compiler->compHndBBtabCount);
1920 noway_assert(finallyNesting < compiler->compHndBBtabCount);
1921
1922 // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
1923 unsigned filterEndOffsetSlotOffs;
1924 PREFIX_ASSUME(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) >
1925 TARGET_POINTER_SIZE); // below doesn't underflow.
1926 filterEndOffsetSlotOffs =
1927 (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
1928
1929 unsigned curNestingSlotOffs;
1930 curNestingSlotOffs = filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE);
1931 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, curNestingSlotOffs);
1932 break;
1933#endif // !FEATURE_EH_FUNCLETS
1934
1935 case GT_PINVOKE_PROLOG:
1936 noway_assert(((gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur) & ~fullIntArgRegMask()) == 0);
1937
1938 // the runtime side requires the codegen here to be consistent
1939 emit->emitDisableRandomNops();
1940 break;
1941
1942 case GT_LABEL:
1943 genPendingCallLabel = genCreateTempLabel();
1944 treeNode->gtLabel.gtLabBB = genPendingCallLabel;
1945 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, genPendingCallLabel, treeNode->gtRegNum);
1946 break;
1947
1948 case GT_STORE_OBJ:
1949 case GT_STORE_DYN_BLK:
1950 case GT_STORE_BLK:
1951 genCodeForStoreBlk(treeNode->AsBlk());
1952 break;
1953
1954 case GT_JMPTABLE:
1955 genJumpTable(treeNode);
1956 break;
1957
1958 case GT_SWITCH_TABLE:
1959 genTableBasedSwitch(treeNode);
1960 break;
1961
1962 case GT_ARR_INDEX:
1963 genCodeForArrIndex(treeNode->AsArrIndex());
1964 break;
1965
1966 case GT_ARR_OFFSET:
1967 genCodeForArrOffset(treeNode->AsArrOffs());
1968 break;
1969
1970 case GT_CLS_VAR_ADDR:
1971 emit->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
1972 genProduceReg(treeNode);
1973 break;
1974
1975#if !defined(_TARGET_64BIT_)
1976 case GT_LONG:
1977 assert(treeNode->isUsedFromReg());
1978 genConsumeRegs(treeNode);
1979 break;
1980#endif
1981
1982 case GT_IL_OFFSET:
1983 // Do nothing; these nodes are simply markers for debug info.
1984 break;
1985
1986 default:
1987 {
1988#ifdef DEBUG
1989 char message[256];
1990 _snprintf_s(message, _countof(message), _TRUNCATE, "NYI: Unimplemented node type %s\n",
1991 GenTree::OpName(treeNode->OperGet()));
1992 NYIRAW(message);
1993#endif
1994 assert(!"Unknown node in codegen");
1995 }
1996 break;
1997 }
1998}
1999
2000//----------------------------------------------------------------------------------
2001// genMultiRegCallStoreToLocal: store multi-reg return value of a call node to a local
2002//
2003// Arguments:
2004// treeNode - Gentree of GT_STORE_LCL_VAR
2005//
2006// Return Value:
2007// None
2008//
2009// Assumption:
2010// The child of store is a multi-reg call node.
2011// genProduceReg() on treeNode is made by caller of this routine.
2012//
2013void CodeGen::genMultiRegCallStoreToLocal(GenTree* treeNode)
2014{
2015 assert(treeNode->OperGet() == GT_STORE_LCL_VAR);
2016
2017#ifdef UNIX_AMD64_ABI
2018 // Structs of size >=9 and <=16 are returned in two return registers on x64 Unix.
2019 assert(varTypeIsStruct(treeNode));
2020
2021 // Assumption: current x64 Unix implementation requires that a multi-reg struct
2022 // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
2023 // being struct promoted.
2024 unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
2025 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
2026 noway_assert(varDsc->lvIsMultiRegRet);
2027
2028 GenTree* op1 = treeNode->gtGetOp1();
2029 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
2030 GenTreeCall* call = actualOp1->AsCall();
2031 assert(call->HasMultiRegRetVal());
2032
2033 genConsumeRegs(op1);
2034
2035 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
2036 assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT);
2037 unsigned regCount = retTypeDesc->GetReturnRegCount();
2038
2039 if (treeNode->gtRegNum != REG_NA)
2040 {
2041 // Right now the only enregistrable structs supported are SIMD types.
2042 assert(varTypeIsSIMD(treeNode));
2043 assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
2044 assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));
2045
        // This is the case where the two 8-byte halves that comprise the operand
        // are in two different xmm registers and need to be assembled into a
        // single xmm register.
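        //
        // For reference, shufpd xmmDst, xmmSrc, imm8 (SSE2) selects halves as follows:
        //     xmmDst[63:0]   = imm8[0] ? xmmDst[127:64] : xmmDst[63:0]
        //     xmmDst[127:64] = imm8[1] ? xmmSrc[127:64] : xmmSrc[63:0]
        // An immediate of 0x00 therefore merges the low half of the source into the high
        // half of the destination, and 0x01 (with source == destination) swaps the halves.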
2049 regNumber targetReg = treeNode->gtRegNum;
2050 regNumber reg0 = call->GetRegNumByIdx(0);
2051 regNumber reg1 = call->GetRegNumByIdx(1);
2052
2053 if (op1->IsCopyOrReload())
2054 {
2055 // GT_COPY/GT_RELOAD will have valid reg for those positions
2056 // that need to be copied or reloaded.
2057 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
2058 if (reloadReg != REG_NA)
2059 {
2060 reg0 = reloadReg;
2061 }
2062
2063 reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
2064 if (reloadReg != REG_NA)
2065 {
2066 reg1 = reloadReg;
2067 }
2068 }
2069
2070 if (targetReg != reg0 && targetReg != reg1)
2071 {
            // Copy reg0 into targetReg and let the copy be handled by one
            // of the cases below. After the copy, targetReg holds the reg0
            // half, so treat it as reg0 from here on.
            inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE);
            reg0 = targetReg;
2076 }
2077
2078 if (targetReg == reg0)
2079 {
            // targetReg[63:0]   = targetReg[63:0]
            // targetReg[127:64] = reg1[63:0]
            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00);
2083 }
2084 else
2085 {
2086 assert(targetReg == reg1);
2087
            // We need two shuffles to achieve this:
            // First:
            // targetReg[63:0]   = targetReg[63:0]
            // targetReg[127:64] = reg0[63:0]
            //
            // Second:
            // targetReg[63:0]   = targetReg[127:64]
            // targetReg[127:64] = targetReg[63:0]
2096 //
2097 // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg
2098 // and next swap low and high 8-bytes of targetReg to have them
2099 // rearranged in the right order.
2100 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00);
2101 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01);
2102 }
2103 }
2104 else
2105 {
2106 // Stack store
2107 int offset = 0;
2108 for (unsigned i = 0; i < regCount; ++i)
2109 {
2110 var_types type = retTypeDesc->GetReturnRegType(i);
2111 regNumber reg = call->GetRegNumByIdx(i);
2112 if (op1->IsCopyOrReload())
2113 {
2114 // GT_COPY/GT_RELOAD will have valid reg for those positions
2115 // that need to be copied or reloaded.
2116 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
2117 if (reloadReg != REG_NA)
2118 {
2119 reg = reloadReg;
2120 }
2121 }
2122
2123 assert(reg != REG_NA);
2124 getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
2125 offset += genTypeSize(type);
2126 }
2127
2128 varDsc->lvRegNum = REG_STK;
2129 }
2130#elif defined(_TARGET_X86_)
2131 // Longs are returned in two return registers on x86.
2132 assert(varTypeIsLong(treeNode));
2133
2134 // Assumption: current x86 implementation requires that a multi-reg long
2135 // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
2136 // being promoted.
2137 unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
2138 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
2139 noway_assert(varDsc->lvIsMultiRegRet);
2140
2141 GenTree* op1 = treeNode->gtGetOp1();
2142 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
2143 GenTreeCall* call = actualOp1->AsCall();
2144 assert(call->HasMultiRegRetVal());
2145
2146 genConsumeRegs(op1);
2147
2148 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
2149 unsigned regCount = retTypeDesc->GetReturnRegCount();
2150 assert(regCount == MAX_RET_REG_COUNT);
2151
2152 // Stack store
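    // For example, a TYP_LONG returned in EDX:EAX is stored as (illustrative):
    //     mov dword ptr [V_lcl+0x00], eax
    //     mov dword ptr [V_lcl+0x04], edx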
2153 int offset = 0;
2154 for (unsigned i = 0; i < regCount; ++i)
2155 {
2156 var_types type = retTypeDesc->GetReturnRegType(i);
2157 regNumber reg = call->GetRegNumByIdx(i);
2158 if (op1->IsCopyOrReload())
2159 {
2160 // GT_COPY/GT_RELOAD will have valid reg for those positions
2161 // that need to be copied or reloaded.
2162 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
2163 if (reloadReg != REG_NA)
2164 {
2165 reg = reloadReg;
2166 }
2167 }
2168
2169 assert(reg != REG_NA);
2170 getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
2171 offset += genTypeSize(type);
2172 }
2173
2174 varDsc->lvRegNum = REG_STK;
2175#else // !UNIX_AMD64_ABI && !_TARGET_X86_
2176 assert(!"Unreached");
2177#endif // !UNIX_AMD64_ABI && !_TARGET_X86_
2178}
2179
2180//------------------------------------------------------------------------
2181// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP.
2182//
2183void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn)
2184{
2185 assert(compiler->compGeneratingProlog);
2186
2187 if (frameSize == 0)
2188 {
2189 return;
2190 }
2191
2192 const target_size_t pageSize = compiler->eeGetPageSize();
2193
2194 if (frameSize == REGSIZE_BYTES)
2195 {
2196 // Frame size is the same as register size.
2197 inst_RV(INS_push, REG_EAX, TYP_I_IMPL);
2198 }
2199 else if (frameSize < pageSize)
2200 {
2201 // Frame size is (0x0008..0x1000)
2202 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2203 }
2204 else if (frameSize < compiler->getVeryLargeFrameSize())
2205 {
2206 // Frame size is (0x1000..0x3000)
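        // For example, a 0x2500-byte frame on x64 emits roughly:
        //     test [rsp - 0x1000], rax
        //     test [rsp - 0x2000], rax
        //     sub  rsp, 0x2500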
2207
2208 getEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, -(int)pageSize);
2209
2210 if (frameSize >= 0x2000)
2211 {
2212 getEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, -2 * (int)pageSize);
2213 }
2214
2215 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2216 }
2217 else
2218 {
2219 // Frame size >= 0x3000
2220 assert(frameSize >= compiler->getVeryLargeFrameSize());
2221
2222 // Emit the following sequence to 'tickle' the pages.
        // Note it is important that the stack pointer not change until this is
2224 // complete since the tickles could cause a stack overflow, and we
2225 // need to be able to crawl the stack afterward (which means the
2226 // stack pointer needs to be known).
2227
2228 bool pushedStubParam = false;
2229 if (compiler->info.compPublishStubParam && (REG_SECRET_STUB_PARAM == initReg))
2230 {
2231 // push register containing the StubParam
2232 inst_RV(INS_push, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
2233 pushedStubParam = true;
2234 }
2235
2236#ifndef _TARGET_UNIX_
2237 instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
2238#endif
2239
2240 //
2241 // Can't have a label inside the ReJIT padding area
2242 //
2243 genPrologPadForReJit();
2244
2245#ifndef _TARGET_UNIX_
2246 // Code size for each instruction. We need this because the
2247 // backward branch is hard-coded with the number of bytes to branch.
2248 // The encoding differs based on the architecture and what register is
2249 // used (namely, using RAX has a smaller encoding).
2250 //
2251 // loop:
2252 // For x86
2253 // test [esp + eax], eax 3
2254 // sub eax, 0x1000 5
2255 // cmp EAX, -frameSize 5
2256 // jge loop 2
2257 //
2258 // For AMD64 using RAX
2259 // test [rsp + rax], rax 4
2260 // sub rax, 0x1000 6
2261 // cmp rax, -frameSize 6
2262 // jge loop 2
2263 //
2264 // For AMD64 using RBP
2265 // test [rsp + rbp], rbp 4
2266 // sub rbp, 0x1000 7
2267 // cmp rbp, -frameSize 7
2268 // jge loop 2
2269
2270 getEmitter()->emitIns_R_ARR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, initReg, 0);
2271 inst_RV_IV(INS_sub, initReg, pageSize, EA_PTRSIZE);
2272 inst_RV_IV(INS_cmp, initReg, -((ssize_t)frameSize), EA_PTRSIZE);
2273
2274 int bytesForBackwardJump;
2275#ifdef _TARGET_AMD64_
2276 assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
2277 bytesForBackwardJump = ((initReg == REG_EAX) ? -18 : -20);
2278#else // !_TARGET_AMD64_
2279 assert(initReg == REG_EAX);
2280 bytesForBackwardJump = -15;
2281#endif // !_TARGET_AMD64_
2282
2283 // Branch backwards to start of loop
2284 inst_IV(INS_jge, bytesForBackwardJump);
2285#else // _TARGET_UNIX_
2286 // Code size for each instruction. We need this because the
2287 // backward branch is hard-coded with the number of bytes to branch.
2288 // The encoding differs based on the architecture and what register is
2289 // used (namely, using RAX has a smaller encoding).
2290 //
2291 // For x86
2292 // lea eax, [esp - frameSize]
2293 // loop:
2294 // lea esp, [esp - pageSize] 7
2295 // test [esp], eax 3
2296 // cmp esp, eax 2
2297 // jge loop 2
2298 // lea rsp, [rbp + frameSize]
2299 //
2300 // For AMD64 using RAX
2301 // lea rax, [rsp - frameSize]
2302 // loop:
2303 // lea rsp, [rsp - pageSize] 8
2304 // test [rsp], rax 4
2305 // cmp rsp, rax 3
2306 // jge loop 2
2307 // lea rsp, [rax + frameSize]
2308 //
2309 // For AMD64 using RBP
2310 // lea rbp, [rsp - frameSize]
2311 // loop:
2312 // lea rsp, [rsp - pageSize] 8
2313 // test [rsp], rbp 4
2314 // cmp rsp, rbp 3
2315 // jge loop 2
2316 // lea rsp, [rbp + frameSize]
2317
2318 int sPageSize = (int)pageSize;
2319
2320 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, initReg, REG_SPBASE, -((ssize_t)frameSize)); // get frame border
2321
2322 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -sPageSize);
2323 getEmitter()->emitIns_R_AR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, 0);
2324 inst_RV_RV(INS_cmp, REG_SPBASE, initReg);
2325
2326 int bytesForBackwardJump;
2327#ifdef _TARGET_AMD64_
2328 assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
2329 bytesForBackwardJump = -17;
2330#else // !_TARGET_AMD64_
2331 assert(initReg == REG_EAX);
2332 bytesForBackwardJump = -14;
2333#endif // !_TARGET_AMD64_
2334
2335 inst_IV(INS_jge, bytesForBackwardJump); // Branch backwards to start of loop
2336
2337 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, initReg, frameSize); // restore stack pointer
2338#endif // _TARGET_UNIX_
2339
2340 *pInitRegZeroed = false; // The initReg does not contain zero
2341
2342 if (pushedStubParam)
2343 {
2344 // pop eax
2345 inst_RV(INS_pop, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
2346 regSet.verifyRegUsed(REG_SECRET_STUB_PARAM);
2347 }
2348
2349 // sub esp, frameSize 6
2350 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2351 }
2352
2353 compiler->unwindAllocStack(frameSize);
2354
2355 if (!doubleAlignOrFramePointerUsed())
2356 {
2357 psiAdjustStackLevel(frameSize);
2358 }
2359}
2360
2361//------------------------------------------------------------------------
2362// genLclHeap: Generate code for localloc.
2363//
2364// Arguments:
2365// tree - the localloc tree to generate.
2366//
2367// Notes:
2368// Note that for x86, we don't track ESP movements while generating the localloc code.
2369// The ESP tracking is used to report stack pointer-relative GC info, which is not
2370// interesting while doing the localloc construction. Also, for functions with localloc,
2371// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
2372// call arguments.
2373//
2374// For x86, we store the ESP after the localloc is complete in the LocAllocSP
2375// variable. This variable is implicitly reported to the VM in the GC info (its position
2376// is defined by convention relative to other items), and is used by the GC to find the
2377// "base" stack pointer in functions with localloc.
2378//
2379void CodeGen::genLclHeap(GenTree* tree)
2380{
2381 assert(tree->OperGet() == GT_LCLHEAP);
2382 assert(compiler->compLocallocUsed);
2383
2384 GenTree* size = tree->gtOp.gtOp1;
2385 noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
2386
2387 regNumber targetReg = tree->gtRegNum;
2388 regNumber regCnt = REG_NA;
2389 var_types type = genActualType(size->gtType);
2390 emitAttr easz = emitTypeSize(type);
2391 BasicBlock* endLabel = nullptr;
2392
2393#ifdef DEBUG
2394 genStackPointerCheck(compiler->opts.compStackCheckOnRet, compiler->lvaReturnSpCheck);
2395#endif
2396
2397 noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
2398 noway_assert(genStackLevel == 0); // Can't have anything on the stack
2399
2400 unsigned stackAdjustment = 0;
2401 BasicBlock* loop = nullptr;
2402
    // Compute the amount of memory to allocate, rounded up to a STACK_ALIGN boundary.
2404 size_t amount = 0;
2405 if (size->IsCnsIntOrI())
2406 {
2407 // If size is a constant, then it must be contained.
2408 assert(size->isContained());
2409
2410 // If amount is zero then return null in targetReg
2411 amount = size->gtIntCon.gtIconVal;
2412 if (amount == 0)
2413 {
2414 instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
2415 goto BAILOUT;
2416 }
2417
        // 'amount' is the total number of bytes to localloc, rounded up to a STACK_ALIGN boundary
2419 amount = AlignUp(amount, STACK_ALIGN);
2420 }
2421 else
2422 {
2423 // The localloc requested memory size is non-constant.
2424
2425 // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
2426 genConsumeRegAndCopy(size, targetReg);
2427 endLabel = genCreateTempLabel();
2428 getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
2429 inst_JMP(EJ_je, endLabel);
2430
2431 // Compute the size of the block to allocate and perform alignment.
2432 // If compInitMem=true, we can reuse targetReg as regcnt,
2433 // since we don't need any internal registers.
2434 if (compiler->info.compInitMem)
2435 {
2436 assert(tree->AvailableTempRegCount() == 0);
2437 regCnt = targetReg;
2438 }
2439 else
2440 {
2441 regCnt = tree->ExtractTempReg();
2442 if (regCnt != targetReg)
2443 {
2444 // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
2445 inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
2446 }
2447 }
2448
2449 // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
2450 // by code like:
2451 // add reg, 15
2452 // and reg, -16
2453 // However, in the initialized memory case, we need the count of STACK_ALIGN-sized
2454 // elements, not a byte count, after the alignment. So instead of the "and", which
2455 // becomes unnecessary, generate a shift, e.g.:
2456 // add reg, 15
2457 // shr reg, 4
2458
2459 inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
2460
2461 if (compiler->info.compInitMem)
2462 {
2463 // Convert the count from a count of bytes to a loop count. We will loop once per
2464 // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes
2465 // on x64 and Linux/x86.
2466 //
            // Note that we zero a single reg-size word per iteration on Windows/x86, four
            // reg-size words per iteration on Linux/x86, and two reg-size words per iteration
            // on x64. We will shift off all the stack alignment bits
2469 // added above, so there is no need for an 'and' instruction.
2470
2471 // --- shr regCnt, 2 (or 4) ---
2472 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
2473 }
2474 else
2475 {
2476 // Otherwise, mask off the low bits to align the byte count.
2477 inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
2478 }
2479 }
2480
2481#if FEATURE_FIXED_OUT_ARGS
2482 // If we have an outgoing arg area then we must adjust the SP by popping off the
2483 // outgoing arg area. We will restore it right before we return from this method.
2484 //
    // Localloc returns stack space that is aligned to STACK_ALIGN bytes. The following
2486 // are the cases that need to be handled:
2487 // i) Method has out-going arg area.
2488 // It is guaranteed that size of out-going arg area is STACK_ALIGN'ed (see fgMorphArgs).
2489 // Therefore, we will pop off the out-going arg area from RSP before allocating the localloc space.
2490 // ii) Method has no out-going arg area.
2491 // Nothing to pop off from the stack.
2492 if (compiler->lvaOutgoingArgSpaceSize > 0)
2493 {
2494 assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain
2495 // aligned
2496 inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
2497 stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
2498 }
2499#endif
2500
2501 if (size->IsCnsIntOrI())
2502 {
2503 // We should reach here only for non-zero, constant size allocations.
2504 assert(amount > 0);
2505 assert((amount % STACK_ALIGN) == 0);
2506 assert((amount % REGSIZE_BYTES) == 0);
2507
2508 // For small allocations we will generate up to six push 0 inline
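        // (e.g., on x64 a 40-byte request is rounded up to 48 bytes above and
        // emits six "push 0" instructions).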
2509 size_t cntRegSizedWords = amount / REGSIZE_BYTES;
2510 if (cntRegSizedWords <= 6)
2511 {
2512 for (; cntRegSizedWords != 0; cntRegSizedWords--)
2513 {
2514 inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
2515 }
2516 goto ALLOC_DONE;
2517 }
2518
2519 bool doNoInitLessThanOnePageAlloc =
2520 !compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <=
2521
2522#ifdef _TARGET_X86_
2523 bool needRegCntRegister = true;
2524#else // !_TARGET_X86_
2525 bool needRegCntRegister = !doNoInitLessThanOnePageAlloc;
2526#endif // !_TARGET_X86_
2527
2528 if (needRegCntRegister)
2529 {
2530 // If compInitMem=true, we can reuse targetReg as regcnt.
2531 // Since size is a constant, regCnt is not yet initialized.
2532 assert(regCnt == REG_NA);
2533 if (compiler->info.compInitMem)
2534 {
2535 assert(tree->AvailableTempRegCount() == 0);
2536 regCnt = targetReg;
2537 }
2538 else
2539 {
2540 regCnt = tree->ExtractTempReg();
2541 }
2542 }
2543
2544 if (doNoInitLessThanOnePageAlloc)
2545 {
2546 // Since the size is less than a page, simply adjust ESP.
2547 // ESP might already be in the guard page, so we must touch it BEFORE
2548 // the alloc, not after.
2549 CLANG_FORMAT_COMMENT_ANCHOR;
2550
2551#ifdef _TARGET_X86_
2552 // For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment
2553 // to ESP. So do the work in the count register.
2554 // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require
2555 // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't
2556 // track".
2557 inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL);
2558 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2559 inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE);
2560 inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL);
2561#else // !_TARGET_X86_
2562 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2563 inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);
2564#endif // !_TARGET_X86_
2565
2566 goto ALLOC_DONE;
2567 }
2568
2569 // else, "mov regCnt, amount"
2570
2571 if (compiler->info.compInitMem)
2572 {
2573 // When initializing memory, we want 'amount' to be the loop count.
2574 assert((amount % STACK_ALIGN) == 0);
2575 amount /= STACK_ALIGN;
2576 }
2577
2578 genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG);
2579 }
2580
2581 loop = genCreateTempLabel();
2582 if (compiler->info.compInitMem)
2583 {
2584 // At this point 'regCnt' is set to the number of loop iterations for this loop, if each
2585 // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
2586 // Since we have to zero out the allocated memory AND ensure that RSP is always valid
2587 // by tickling the pages, we will just push 0's on the stack.
2588
2589 assert(genIsValidIntReg(regCnt));
2590
2591 // Loop:
2592 genDefineTempLabel(loop);
2593
2594 static_assert_no_msg((STACK_ALIGN % REGSIZE_BYTES) == 0);
2595 unsigned const count = (STACK_ALIGN / REGSIZE_BYTES);
2596
2597 for (unsigned i = 0; i < count; i++)
2598 {
2599 inst_IV(INS_push_hide, 0); // --- push REG_SIZE bytes of 0
2600 }
2601 // Note that the stack must always be aligned to STACK_ALIGN bytes
2602
2603 // Decrement the loop counter and loop if not done.
2604 inst_RV(INS_dec, regCnt, TYP_I_IMPL);
2605 inst_JMP(EJ_jne, loop);
2606 }
2607 else
2608 {
2609 // At this point 'regCnt' is set to the total number of bytes to localloc.
2610 //
2611 // We don't need to zero out the allocated memory. However, we do have
2612 // to tickle the pages to ensure that ESP is always valid and is
2613 // in sync with the "stack guard page". Note that in the worst
2614 // case ESP is on the last byte of the guard page. Thus you must
        // touch ESP+0 first, not ESP+0x1000.
2616 //
2617 // Another subtlety is that you don't want ESP to be exactly on the
2618 // boundary of the guard page because PUSH is predecrement, thus
2619 // call setup would not touch the guard page but just beyond it
2620 //
2621 // Note that we go through a few hoops so that ESP never points to
2622 // illegal pages at any time during the tickling process
2623 //
2624 // neg REGCNT
2625 // add REGCNT, ESP // reg now holds ultimate ESP
2626 // jb loop // result is smaller than orignial ESP (no wrap around)
2627 // xor REGCNT, REGCNT, // Overflow, pick lowest possible number
2628 // loop:
2629 // test ESP, [ESP+0] // tickle the page
2630 // mov REGTMP, ESP
2631 // sub REGTMP, GetOsPageSize()
2632 // mov ESP, REGTMP
2633 // cmp ESP, REGCNT
2634 // jae loop
2635 //
2636 // mov ESP, REG
2637 // end:
2638 inst_RV(INS_NEG, regCnt, TYP_I_IMPL);
2639 inst_RV_RV(INS_add, regCnt, REG_SPBASE, TYP_I_IMPL);
2640 inst_JMP(EJ_jb, loop);
2641
2642 instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
2643
2644 genDefineTempLabel(loop);
2645
        // Tickle the decremented value, and move it back to ESP. Note that this has
        // to be done BEFORE the update of ESP since ESP might already be on the
        // guard page. It is OK to leave the final value of ESP on the guard page.
2650 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2651
2652 // This is a harmless trick to avoid the emitter trying to track the
2653 // decrement of the ESP - we do the subtraction in another reg instead
2654 // of adjusting ESP directly.
2655 regNumber regTmp = tree->GetSingleTempReg();
2656
2657 inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
2658 inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE);
2659 inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL);
2660
2661 inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL);
2662 inst_JMP(EJ_jae, loop);
2663
2664 // Move the final value to ESP
2665 inst_RV_RV(INS_mov, REG_SPBASE, regCnt);
2666 }
2667
2668ALLOC_DONE:
2669 // Re-adjust SP to allocate out-going arg area
2670 if (stackAdjustment > 0)
2671 {
2672 assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
2673 inst_RV_IV(INS_sub, REG_SPBASE, stackAdjustment, EA_PTRSIZE);
2674 }
2675
2676 // Return the stackalloc'ed address in result register.
2677 // TargetReg = RSP + stackAdjustment.
2678 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, targetReg, REG_SPBASE, stackAdjustment);
2679
2680 if (endLabel != nullptr)
2681 {
2682 genDefineTempLabel(endLabel);
2683 }
2684
2685BAILOUT:
2686
2687#ifdef JIT32_GCENCODER
2688 if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
2689 {
2690 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0);
2691 }
2692#endif // JIT32_GCENCODER
2693
2694#if STACK_PROBES
2695 if (compiler->opts.compNeedStackProbes)
2696 {
2697 genGenerateStackProbe();
2698 }
2699#endif
2700
2701#ifdef DEBUG
2702 // Update local variable to reflect the new stack pointer.
2703 if (compiler->opts.compStackCheckOnRet)
2704 {
2705 noway_assert(compiler->lvaReturnSpCheck != 0xCCCCCCCC &&
2706 compiler->lvaTable[compiler->lvaReturnSpCheck].lvDoNotEnregister &&
2707 compiler->lvaTable[compiler->lvaReturnSpCheck].lvOnFrame);
2708 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnSpCheck, 0);
2709 }
2710#endif
2711
2712 genProduceReg(tree);
2713}
2714
2715void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
2716{
2717 assert(storeBlkNode->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
2718
2719 if (storeBlkNode->OperIs(GT_STORE_OBJ) && storeBlkNode->OperIsCopyBlkOp() && !storeBlkNode->gtBlkOpGcUnsafe)
2720 {
2721 assert(storeBlkNode->AsObj()->gtGcPtrCount != 0);
2722 genCodeForCpObj(storeBlkNode->AsObj());
2723 return;
2724 }
2725
2726#ifdef JIT32_GCENCODER
2727 assert(!storeBlkNode->gtBlkOpGcUnsafe);
2728#else
2729 if (storeBlkNode->gtBlkOpGcUnsafe)
2730 {
2731 getEmitter()->emitDisableGC();
2732 }
2733#endif // JIT32_GCENCODER
2734
2735 bool isCopyBlk = storeBlkNode->OperIsCopyBlkOp();
2736
2737 switch (storeBlkNode->gtBlkOpKind)
2738 {
2739#ifdef _TARGET_AMD64_
2740 case GenTreeBlk::BlkOpKindHelper:
2741 if (isCopyBlk)
2742 {
2743 genCodeForCpBlk(storeBlkNode);
2744 }
2745 else
2746 {
2747 genCodeForInitBlk(storeBlkNode);
2748 }
2749 break;
2750#endif // _TARGET_AMD64_
2751 case GenTreeBlk::BlkOpKindRepInstr:
2752 if (isCopyBlk)
2753 {
2754 genCodeForCpBlkRepMovs(storeBlkNode);
2755 }
2756 else
2757 {
2758 genCodeForInitBlkRepStos(storeBlkNode);
2759 }
2760 break;
2761 case GenTreeBlk::BlkOpKindUnroll:
2762 if (isCopyBlk)
2763 {
2764 genCodeForCpBlkUnroll(storeBlkNode);
2765 }
2766 else
2767 {
2768 genCodeForInitBlkUnroll(storeBlkNode);
2769 }
2770 break;
2771 default:
2772 unreached();
2773 }
2774
2775#ifndef JIT32_GCENCODER
2776 if (storeBlkNode->gtBlkOpGcUnsafe)
2777 {
2778 getEmitter()->emitEnableGC();
2779 }
2780#endif // !defined(JIT32_GCENCODER)
2781}
2782
2783//
2784//------------------------------------------------------------------------
2785// genCodeForInitBlkRepStos: Generate code for InitBlk using rep stos.
2786//
2787// Arguments:
2788// initBlkNode - The Block store for which we are generating code.
2789//
2790// Preconditions:
2791// On x64:
2792// The size of the buffers must be a constant and also less than INITBLK_STOS_LIMIT bytes.
2793// Any value larger than that, we'll use the helper even if both the fill byte and the
2794// size are integer constants.
2795// On x86:
2796// The size must either be a non-constant or less than INITBLK_STOS_LIMIT bytes.
2797//
2798void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
2799{
2800 // Make sure we got the arguments of the initblk/initobj operation in the right registers.
2801 unsigned size = initBlkNode->Size();
2802 GenTree* dstAddr = initBlkNode->Addr();
2803 GenTree* initVal = initBlkNode->Data();
2804 if (initVal->OperIsInitVal())
2805 {
2806 initVal = initVal->gtGetOp1();
2807 }
2808
2809#ifdef DEBUG
2810 assert(dstAddr->isUsedFromReg());
2811 assert(initVal->isUsedFromReg());
2812#ifdef _TARGET_AMD64_
2813 assert(size != 0);
2814#endif
2815 if (initVal->IsCnsIntOrI())
2816 {
2817#ifdef _TARGET_AMD64_
2818 assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
2819#else
2820 // Note that a size of zero means a non-constant size.
2821 assert((size == 0) || (size > CPBLK_UNROLL_LIMIT));
2822#endif
2823 }
2824
2825#endif // DEBUG
2826
2827 genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
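    // rep stosb stores AL into [RDI] RCX times, advancing RDI; the call above placed the
    // destination address in RDI, the fill value in RAX, and the byte count in RCX.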
2828 instGen(INS_r_stosb);
2829}
2830
2831// Generate code for InitBlk by performing a loop unroll
2832// Preconditions:
2833// a) Both the size and fill byte value are integer constants.
2834// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
2835//
2836void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
2837{
2838 // Make sure we got the arguments of the initblk/initobj operation in the right registers
2839 unsigned size = initBlkNode->Size();
2840 GenTree* dstAddr = initBlkNode->Addr();
2841 GenTree* initVal = initBlkNode->Data();
2842 if (initVal->OperIsInitVal())
2843 {
2844 initVal = initVal->gtGetOp1();
2845 }
2846
2847 assert(dstAddr->isUsedFromReg());
2848 assert(initVal->isUsedFromReg() || (initVal->IsIntegralConst(0) && ((size & 0xf) == 0)));
2849 assert(size != 0);
2850 assert(size <= INITBLK_UNROLL_LIMIT);
2851 assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI());
2852
2853 emitter* emit = getEmitter();
2854
2855 genConsumeOperands(initBlkNode);
2856
2857 // If the initVal was moved, or spilled and reloaded to a different register,
2858 // get the original initVal from below the GT_RELOAD, but only after capturing the valReg,
2859 // which needs to be the new register.
2860 regNumber valReg = initVal->gtRegNum;
2861 initVal = initVal->gtSkipReloadOrCopy();
2862
2863 unsigned offset = 0;
2864
2865 // Perform an unroll using SSE2 loads and stores.
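    // For example, a 24-byte zero-fill on x64 emits roughly (register names illustrative):
    //     xorps   xmm0, xmm0
    //     movdqu  xmmword ptr [rdi], xmm0
    //     mov     qword ptr [rdi+16], rcx    ; rcx holds the (zero) fill value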
2866 if (size >= XMM_REGSIZE_BYTES)
2867 {
2868 regNumber tmpReg = initBlkNode->GetSingleTempReg();
2869 assert(genIsValidFloatReg(tmpReg));
2870
2871 if (initVal->gtIntCon.gtIconVal != 0)
2872 {
2873 emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, tmpReg, valReg);
2874 emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
2875#ifdef _TARGET_X86_
2876 // For x86, we need one more to convert it from 8 bytes to 16 bytes.
2877 emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
2878#endif // _TARGET_X86_
2879 }
2880 else
2881 {
2882 emit->emitIns_R_R(INS_xorps, EA_8BYTE, tmpReg, tmpReg);
2883 }
2884
2885 // Determine how many 16 byte slots we're going to fill using SSE movs.
2886 size_t slots = size / XMM_REGSIZE_BYTES;
2887
2888 while (slots-- > 0)
2889 {
2890 emit->emitIns_AR_R(INS_movdqu, EA_8BYTE, tmpReg, dstAddr->gtRegNum, offset);
2891 offset += XMM_REGSIZE_BYTES;
2892 }
2893 }
2894
2895 // Fill the remainder (or a < 16 byte sized struct)
2896 if ((size & 8) != 0)
2897 {
2898#ifdef _TARGET_X86_
2899 // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
2900 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2901 offset += 4;
2902 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2903 offset += 4;
2904#else // !_TARGET_X86_
2905
2906 emit->emitIns_AR_R(INS_mov, EA_8BYTE, valReg, dstAddr->gtRegNum, offset);
2907 offset += 8;
2908
2909#endif // !_TARGET_X86_
2910 }
2911 if ((size & 4) != 0)
2912 {
2913 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2914 offset += 4;
2915 }
2916 if ((size & 2) != 0)
2917 {
2918 emit->emitIns_AR_R(INS_mov, EA_2BYTE, valReg, dstAddr->gtRegNum, offset);
2919 offset += 2;
2920 }
2921 if ((size & 1) != 0)
2922 {
2923 emit->emitIns_AR_R(INS_mov, EA_1BYTE, valReg, dstAddr->gtRegNum, offset);
2924 }
2925}
2926
2927// Generates code for InitBlk by calling the VM memset helper function.
2928// Preconditions:
2929// a) The size argument of the InitBlk is not an integer constant.
2930// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes.
2931void CodeGen::genCodeForInitBlk(GenTreeBlk* initBlkNode)
2932{
2933#ifdef _TARGET_AMD64_
2934 // Make sure we got the arguments of the initblk operation in the right registers
2935 unsigned blockSize = initBlkNode->Size();
2936 GenTree* dstAddr = initBlkNode->Addr();
2937 GenTree* initVal = initBlkNode->Data();
2938 if (initVal->OperIsInitVal())
2939 {
2940 initVal = initVal->gtGetOp1();
2941 }
2942
2943 assert(dstAddr->isUsedFromReg());
2944 assert(initVal->isUsedFromReg());
2945
2946 if (blockSize != 0)
2947 {
2948 assert(blockSize >= CPBLK_MOVS_LIMIT);
2949 }
2950
2951 genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
2952
2953 genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
2954#else // !_TARGET_AMD64_
2955 NYI_X86("Helper call for InitBlk");
2956#endif // !_TARGET_AMD64_
2957}
2958
2959// Generate code for a load from some address + offset
2960// baseNode: tree node which can be either a local address or arbitrary node
2961// offset: distance from the baseNode from which to load
2962void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset)
2963{
2964 emitter* emit = getEmitter();
2965
2966 if (baseNode->OperIsLocalAddr())
2967 {
2968 if (baseNode->gtOper == GT_LCL_FLD_ADDR)
2969 {
2970 offset += baseNode->gtLclFld.gtLclOffs;
2971 }
2972 emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset);
2973 }
2974 else
2975 {
2976 emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset);
2977 }
2978}
2979
2980//------------------------------------------------------------------------
2981// genCodeForStoreOffset: Generate code to store a reg to [base + offset].
2982//
2983// Arguments:
2984// ins - the instruction to generate.
2985// size - the size that needs to be stored.
2986// src - the register which needs to be stored.
2987// baseNode - the base, relative to which to store the src register.
2988// offset - the offset that is added to the baseNode to calculate the address to store into.
2989//
2990void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset)
2991{
2992 emitter* emit = getEmitter();
2993
2994 if (baseNode->OperIsLocalAddr())
2995 {
2996 if (baseNode->gtOper == GT_LCL_FLD_ADDR)
2997 {
2998 offset += baseNode->gtLclFld.gtLclOffs;
2999 }
3000
3001 emit->emitIns_S_R(ins, size, src, baseNode->AsLclVarCommon()->GetLclNum(), offset);
3002 }
3003 else
3004 {
3005 emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset);
3006 }
3007}
3008
3009// Generates CpBlk code by performing a loop unroll
3010// Preconditions:
3011// The size argument of the CpBlk node is a constant and <= 64 bytes.
3012// This may seem small but covers >95% of the cases in several framework assemblies.
3013//
3014void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
3015{
3016 // Make sure we got the arguments of the cpblk operation in the right registers
3017 unsigned size = cpBlkNode->Size();
3018 GenTree* dstAddr = cpBlkNode->Addr();
3019 GenTree* source = cpBlkNode->Data();
3020 GenTree* srcAddr = nullptr;
3021 assert(size <= CPBLK_UNROLL_LIMIT);
3022
3023 emitter* emit = getEmitter();
3024
3025 if (dstAddr->isUsedFromReg())
3026 {
3027 genConsumeReg(dstAddr);
3028 }
3029
3030 if (source->gtOper == GT_IND)
3031 {
3032 srcAddr = source->gtGetOp1();
3033 if (srcAddr->isUsedFromReg())
3034 {
3035 genConsumeReg(srcAddr);
3036 }
3037 }
3038 else
3039 {
3040 noway_assert(source->IsLocal());
3041 // TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree.
3042 // OR: transform source to GT_IND(GT_LCL_VAR_ADDR)
3043 if (source->OperGet() == GT_LCL_VAR)
3044 {
3045 source->SetOper(GT_LCL_VAR_ADDR);
3046 }
3047 else
3048 {
3049 assert(source->OperGet() == GT_LCL_FLD);
3050 source->SetOper(GT_LCL_FLD_ADDR);
3051 }
3052 srcAddr = source;
3053 }
3054
3055 unsigned offset = 0;
3056
    // If the size of this struct is 16 bytes or more,
3058 // let's use SSE2 to be able to do 16 byte at a time
3059 // loads and stores.
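    //
    // For example, copying a 24-byte struct on x64 emits roughly (register names illustrative):
    //     movdqu  xmm0, xmmword ptr [rsi]
    //     movdqu  xmmword ptr [rdi], xmm0
    //     mov     rax, qword ptr [rsi+16]
    //     mov     qword ptr [rdi+16], rax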
3060
3061 if (size >= XMM_REGSIZE_BYTES)
3062 {
3063 regNumber xmmReg = cpBlkNode->GetSingleTempReg(RBM_ALLFLOAT);
3064 assert(genIsValidFloatReg(xmmReg));
3065 size_t slots = size / XMM_REGSIZE_BYTES;
3066
3067 // TODO: In the below code the load and store instructions are for 16 bytes, but the
3068 // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
3069 // this probably needs to be changed.
3070 while (slots-- > 0)
3071 {
3072 // Load
3073 genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset);
3074 // Store
3075 genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
3076 offset += XMM_REGSIZE_BYTES;
3077 }
3078 }
3079
3080 // Fill the remainder (15 bytes or less) if there's one.
3081 if ((size & 0xf) != 0)
3082 {
3083 // Grab the integer temp register to emit the remaining loads and stores.
3084 regNumber tmpReg = cpBlkNode->GetSingleTempReg(RBM_ALLINT);
3085
3086 if ((size & 8) != 0)
3087 {
3088#ifdef _TARGET_X86_
3089 // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
3090 for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4)
3091 {
3092 genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
3093 genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
3094 }
3095#else // !_TARGET_X86_
3096 genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset);
3097 genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
3098 offset += 8;
3099#endif // !_TARGET_X86_
3100 }
3101 if ((size & 4) != 0)
3102 {
3103 genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
3104 genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
3105 offset += 4;
3106 }
3107 if ((size & 2) != 0)
3108 {
3109 genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset);
3110 genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
3111 offset += 2;
3112 }
3113 if ((size & 1) != 0)
3114 {
3115 genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset);
3116 genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
3117 }
3118 }
3119}
3120
3121// Generate code for CpBlk by using rep movs
3122// Preconditions:
3123// The size argument of the CpBlk is a constant and is between
3124// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
3125void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode)
3126{
3127 // Make sure we got the arguments of the cpblk operation in the right registers
3128 unsigned size = cpBlkNode->Size();
3129 GenTree* dstAddr = cpBlkNode->Addr();
3130 GenTree* source = cpBlkNode->Data();
3131 GenTree* srcAddr = nullptr;
3132
3133#ifdef DEBUG
3134 assert(dstAddr->isUsedFromReg());
3135 assert(source->isContained());
3136
3137#ifdef _TARGET_X86_
3138 if (size == 0)
3139 {
3140 noway_assert(cpBlkNode->OperGet() == GT_STORE_DYN_BLK);
3141 }
3142 else
3143#endif
3144 {
#ifdef _TARGET_AMD64_
3146 assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
3147#else
3148 assert(size > CPBLK_UNROLL_LIMIT);
3149#endif
3150 }
3151#endif // DEBUG
3152
3153 genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX);
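    // rep movsb copies RCX bytes from [RSI] to [RDI], advancing both pointers; the call
    // above placed the destination in RDI, the source in RSI, and the byte count in RCX.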
3154 instGen(INS_r_movsb);
3155}
3156
3157#ifdef FEATURE_PUT_STRUCT_ARG_STK
3158//------------------------------------------------------------------------
3159// CodeGen::genMove8IfNeeded: Conditionally move 8 bytes of a struct to the argument area
3160//
3161// Arguments:
3162// size - The size of bytes remaining to be moved
3163// longTmpReg - The tmp register to be used for the long value
3164// srcAddr - The address of the source struct
3165// offset - The current offset being copied
3166//
3167// Return Value:
3168// Returns the number of bytes moved (8 or 0).
3169//
3170// Notes:
3171// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3172// not an even multiple of 16.
3173// On x86, longTmpReg must be an xmm reg; on x64 it must be an integer register.
3174// This is checked by genStoreRegToStackArg.
3175//
3176unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree* srcAddr, unsigned offset)
3177{
3178#ifdef _TARGET_X86_
3179 instruction longMovIns = INS_movq;
3180#else // !_TARGET_X86_
3181 instruction longMovIns = INS_mov;
3182#endif // !_TARGET_X86_
3183 if ((size & 8) != 0)
3184 {
3185 genCodeForLoadOffset(longMovIns, EA_8BYTE, longTmpReg, srcAddr, offset);
3186 genStoreRegToStackArg(TYP_LONG, longTmpReg, offset);
3187 return 8;
3188 }
3189 return 0;
3190}
3191
3192//------------------------------------------------------------------------
3193// CodeGen::genMove4IfNeeded: Conditionally move 4 bytes of a struct to the argument area
3194//
3195// Arguments:
3196// size - The size of bytes remaining to be moved
3197// intTmpReg - The tmp register to be used for the long value
3198// srcAddr - The address of the source struct
3199// offset - The current offset being copied
3200//
3201// Return Value:
3202// Returns the number of bytes moved (4 or 0).
3203//
3204// Notes:
3205// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3206// not an even multiple of 16.
3207// intTmpReg must be an integer register.
3208// This is checked by genStoreRegToStackArg.
3209//
3210unsigned CodeGen::genMove4IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3211{
3212 if ((size & 4) != 0)
3213 {
3214 genCodeForLoadOffset(INS_mov, EA_4BYTE, intTmpReg, srcAddr, offset);
3215 genStoreRegToStackArg(TYP_INT, intTmpReg, offset);
3216 return 4;
3217 }
3218 return 0;
3219}
3220
3221//------------------------------------------------------------------------
3222// CodeGen::genMove2IfNeeded: Conditionally move 2 bytes of a struct to the argument area
3223//
3224// Arguments:
3225// size - The size of bytes remaining to be moved
3226// intTmpReg - The tmp register to be used for the long value
3227// srcAddr - The address of the source struct
3228// offset - The current offset being copied
3229//
3230// Return Value:
3231// Returns the number of bytes moved (2 or 0).
3232//
3233// Notes:
3234// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3235// not an even multiple of 16.
3236// intTmpReg must be an integer register.
3237// This is checked by genStoreRegToStackArg.
3238//
3239unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3240{
3241 if ((size & 2) != 0)
3242 {
3243 genCodeForLoadOffset(INS_mov, EA_2BYTE, intTmpReg, srcAddr, offset);
3244 genStoreRegToStackArg(TYP_SHORT, intTmpReg, offset);
3245 return 2;
3246 }
3247 return 0;
3248}
3249
3250//------------------------------------------------------------------------
3251// CodeGen::genMove1IfNeeded: Conditionally move 1 byte of a struct to the argument area
3252//
3253// Arguments:
3254// size - The size of bytes remaining to be moved
3255// intTmpReg - The tmp register to be used for the long value
3256// srcAddr - The address of the source struct
3257// offset - The current offset being copied
3258//
3259// Return Value:
3260// Returns the number of bytes moved (1 or 0).
3261//
3262// Notes:
3263// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3264// not an even multiple of 16.
3265// intTmpReg must be an integer register.
3266// This is checked by genStoreRegToStackArg.
3267//
3268unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3269{
3270 if ((size & 1) != 0)
3271 {
3272 genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset);
3273 genStoreRegToStackArg(TYP_BYTE, intTmpReg, offset);
3274 return 1;
3275 }
3276 return 0;
3277}
3278
3279//---------------------------------------------------------------------------------------------------------------//
3280// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling.
3281//
3282// Arguments:
3283// putArgNode - the PutArgStk tree.
3284//
3285// Notes:
3286// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct will be copied to the
3287// stack.
3288//
3289// TODO-Amd64-Unix: Try to share code with copyblk.
3290// Need refactoring of copyblk before it could be used for putarg_stk.
3291// The difference for now is that a putarg_stk contains its children, while cpyblk does not.
3292// This creates differences in code. After some significant refactoring it could be reused.
3293//
3294void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
3295{
3296 GenTree* src = putArgNode->gtOp.gtOp1;
3297 // We will never call this method for SIMD types, which are stored directly
3298 // in genPutStructArgStk().
3299 noway_assert(src->TypeGet() == TYP_STRUCT);
3300
3301 unsigned size = putArgNode->getArgSize();
3302 assert(size <= CPBLK_UNROLL_LIMIT);
3303
3304 emitter* emit = getEmitter();
3305 unsigned putArgOffset = putArgNode->getArgOffset();
3306
3307 assert(src->isContained());
3308
3309 assert(src->gtOper == GT_OBJ);
3310
3311 if (src->gtOp.gtOp1->isUsedFromReg())
3312 {
3313 genConsumeReg(src->gtOp.gtOp1);
3314 }
3315
3316 unsigned offset = 0;
3317
3318 regNumber xmmTmpReg = REG_NA;
3319 regNumber intTmpReg = REG_NA;
3320 regNumber longTmpReg = REG_NA;
3321#ifdef _TARGET_X86_
3322 // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
3323 // less than 16 bytes, we will just be using pushes
3324 if (size >= 8)
3325 {
3326 xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
3327 longTmpReg = xmmTmpReg;
3328 }
3329 if ((size & 0x7) != 0)
3330 {
3331 intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
3332 }
3333#else // !_TARGET_X86_
3334 // On x64 we use an XMM register only for 16-byte chunks.
3335 if (size >= XMM_REGSIZE_BYTES)
3336 {
3337 xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
3338 }
3339 if ((size & 0xf) != 0)
3340 {
3341 intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
3342 longTmpReg = intTmpReg;
3343 }
3344#endif // !_TARGET_X86_
3345
    // If the size of this struct is 16 bytes or more,
3347 // let's use SSE2 to be able to do 16 byte at a time
3348 // loads and stores.
3349 if (size >= XMM_REGSIZE_BYTES)
3350 {
3351#ifdef _TARGET_X86_
3352 assert(!m_pushStkArg);
3353#endif // _TARGET_X86_
3354 size_t slots = size / XMM_REGSIZE_BYTES;
3355
3356 assert(putArgNode->gtGetOp1()->isContained());
3357 assert(putArgNode->gtGetOp1()->gtOp.gtOper == GT_OBJ);
3358
3359 // TODO: In the below code the load and store instructions are for 16 bytes, but the
3360 // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
3361 // this probably needs to be changed.
3362 while (slots-- > 0)
3363 {
3364 // Load
3365 genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
3366
3367 // Store
3368 genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
3369
3370 offset += XMM_REGSIZE_BYTES;
3371 }
3372 }
3373
3374 // Fill the remainder (15 bytes or less) if there's one.
3375 if ((size & 0xf) != 0)
3376 {
3377#ifdef _TARGET_X86_
3378 if (m_pushStkArg)
3379 {
3380 // This case is currently supported only for the case where the total size is
3381 // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
3382 // order. However, morph has ensured that we have a struct that is an even
3383 // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
3384 assert(((size & 0xc) == size) && (offset == 0));
3385 // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
3386 // whether we've got an 8 byte chunk, and then push it on the stack.
3387 unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, size & 0x8);
3388 // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
3389 // and push it on the stack.
3390 pushedBytes += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, 0);
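            // Because pushes grow the stack downward, pushing the higher-offset 4-byte
            // chunk first and the offset-0 8-byte chunk second leaves the struct bytes
            // in ascending-address order in the outgoing argument space.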
3391 }
3392 else
3393#endif // _TARGET_X86_
3394 {
3395 offset += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, offset);
3396 offset += genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3397 offset += genMove2IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3398 offset += genMove1IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3399 assert(offset == size);
3400 }
3401 }
3402}
3403
3404//------------------------------------------------------------------------
3405// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs.
3406//
3407// Arguments:
3408// putArgNode - the PutArgStk tree.
3409//
3410// Preconditions:
3411// The size argument of the PutArgStk (for structs) is a constant and is between
3412// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
3413// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go.
3414//
3415void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
3416{
3417 GenTree* srcAddr = putArgNode->gtGetOp1();
3418 assert(srcAddr->TypeGet() == TYP_STRUCT);
3419 assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT);
3420
3421 // Make sure we got the arguments of the cpblk operation in the right registers, and that
3422 // 'srcAddr' is contained as expected.
3423 assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
3424 assert(srcAddr->isContained());
3425
3426 genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
3427 instGen(INS_r_movsb);
3428}
3429
3430//------------------------------------------------------------------------
// If any Vector3 args are on the stack and are not passed by reference, their upper
// 32 bits must be cleared to zero. The native compiler doesn't clear the upper bits,
// and there is no way to know whether the caller is native or not. So the upper
// 32 bits of a Vector3 argument on the stack are always cleared to zero.
3435#if defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
3436void CodeGen::genClearStackVec3ArgUpperBits()
3437{
3438#ifdef DEBUG
3439 if (verbose)
3440 {
3441 printf("*************** In genClearStackVec3ArgUpperBits()\n");
3442 }
3443#endif
3444
3445 assert(compiler->compGeneratingProlog);
3446
3449 for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++)
3450 {
3451 LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
3452 assert(varDsc->lvIsParam);
3453
        // Does the var have a SIMD12 type?
3455 if (varDsc->lvType != TYP_SIMD12)
3456 {
3457 continue;
3458 }
3459
3460 if (!varDsc->lvIsRegArg)
3461 {
3462 // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0
3463 getEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0);
3464 }
3465 else
3466 {
            // Assume that for x64 Linux, an argument is either fully in registers
            // or fully on the stack.
3469 regNumber argReg = varDsc->GetOtherArgReg();
3470
3471 // Clear the upper 32 bits by two shift instructions.
3472 // argReg = argReg << 96
3473 getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
3474 // argReg = argReg >> 96
3475 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
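            // For example (illustrative), for an argument homed in xmm1 the two emits above produce:
            //     pslldq xmm1, 12
            //     psrldq xmm1, 12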
3476 }
3477 }
3478}
3479#endif // defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
3480#endif // FEATURE_PUT_STRUCT_ARG_STK
3481
// Generate code for CpObj nodes, which copy structs that have interleaved GC pointers.
// This generates a sequence of movsp instructions for runs of non-GC members, and calls
// to the CORINFO_HELP_ASSIGN_BYREF helper for the GC-pointer slots.
// Note that movsp is an alias for movsd on x86 and movsq on x64.
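//
// For example (illustrative), copying a struct with slot layout { ref, non-GC, non-GC, ref } to a
// destination that may be on the GC heap would emit roughly:
//     call CORINFO_HELP_ASSIGN_BYREF   ; first GC slot (the helper advances RSI/RDI)
//     movsp                            ; two non-GC slots (rep movsp is used for longer runs)
//     movsp
//     call CORINFO_HELP_ASSIGN_BYREF   ; second GC slot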
3487void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
3488{
3489 // Make sure we got the arguments of the cpobj operation in the right registers
3490 GenTree* dstAddr = cpObjNode->Addr();
3491 GenTree* source = cpObjNode->Data();
3492 GenTree* srcAddr = nullptr;
3493 var_types srcAddrType = TYP_BYREF;
3494 bool sourceIsLocal = false;
3495
3496 assert(source->isContained());
3497 if (source->gtOper == GT_IND)
3498 {
3499 srcAddr = source->gtGetOp1();
3500 assert(srcAddr->isUsedFromReg());
3501 }
3502 else
3503 {
3504 noway_assert(source->IsLocal());
3505 sourceIsLocal = true;
3506 }
3507
3508 bool dstOnStack = dstAddr->gtSkipReloadOrCopy()->OperIsLocalAddr();
3509
3510#ifdef DEBUG
3511
3512 assert(dstAddr->isUsedFromReg());
3513
3514 // If the GenTree node has data about GC pointers, this means we're dealing
3515 // with CpObj, so this requires special logic.
3516 assert(cpObjNode->gtGcPtrCount > 0);
3517
3518 // MovSp (alias for movsq on x64 and movsd on x86) instruction is used for copying non-gcref fields
3519 // and it needs src = RSI and dst = RDI.
3520 // Either these registers must not contain lclVars, or they must be dying or marked for spill.
3521 // This is because these registers are incremented as we go through the struct.
3522 if (!sourceIsLocal)
3523 {
3524 GenTree* actualSrcAddr = srcAddr->gtSkipReloadOrCopy();
3525 GenTree* actualDstAddr = dstAddr->gtSkipReloadOrCopy();
3526 unsigned srcLclVarNum = BAD_VAR_NUM;
3527 unsigned dstLclVarNum = BAD_VAR_NUM;
3528 bool isSrcAddrLiveOut = false;
3529 bool isDstAddrLiveOut = false;
3530 if (genIsRegCandidateLocal(actualSrcAddr))
3531 {
3532 srcLclVarNum = actualSrcAddr->AsLclVarCommon()->gtLclNum;
3533 isSrcAddrLiveOut = ((actualSrcAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
3534 }
3535 if (genIsRegCandidateLocal(actualDstAddr))
3536 {
3537 dstLclVarNum = actualDstAddr->AsLclVarCommon()->gtLclNum;
3538 isDstAddrLiveOut = ((actualDstAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
3539 }
3540 assert((actualSrcAddr->gtRegNum != REG_RSI) || !isSrcAddrLiveOut ||
3541 ((srcLclVarNum == dstLclVarNum) && !isDstAddrLiveOut));
3542 assert((actualDstAddr->gtRegNum != REG_RDI) || !isDstAddrLiveOut ||
3543 ((srcLclVarNum == dstLclVarNum) && !isSrcAddrLiveOut));
3544 srcAddrType = srcAddr->TypeGet();
3545 }
3546#endif // DEBUG
3547
3548 // Consume the operands and get them into the right registers.
3549 // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
3550 genConsumeBlockOp(cpObjNode, REG_RDI, REG_RSI, REG_NA);
3551 gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddrType);
3552 gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet());
3553
3554 unsigned slots = cpObjNode->gtSlots;
3555
3556 // If we can prove it's on the stack we don't need to use the write barrier.
3557 if (dstOnStack)
3558 {
3559 if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
3560 {
            // If the destination of the CpObj is on the stack, make sure we allocated
            // RCX to emit rep movsp (alias for rep movsd or rep movsq for 32 and 64 bits respectively).
3563 assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
3564
3565 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
3566 instGen(INS_r_movsp);
3567 }
3568 else
3569 {
3570 // For small structs, it's better to emit a sequence of movsp than to
3571 // emit a rep movsp instruction.
3572 while (slots > 0)
3573 {
3574 instGen(INS_movsp);
3575 slots--;
3576 }
3577 }
3578 }
3579 else
3580 {
3581 BYTE* gcPtrs = cpObjNode->gtGcPtrs;
3582 unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
3583
3584 unsigned i = 0;
3585 while (i < slots)
3586 {
3587 switch (gcPtrs[i])
3588 {
3589 case TYPE_GC_NONE:
3590 // Let's see if we can use rep movsp instead of a sequence of movsp instructions
3591 // to save cycles and code size.
3592 {
3593 unsigned nonGcSlotCount = 0;
3594
3595 do
3596 {
3597 nonGcSlotCount++;
3598 i++;
3599 } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
3600
3601 // If we have a very small contiguous non-gc region, it's better just to
3602 // emit a sequence of movsp instructions
3603 if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
3604 {
3605 while (nonGcSlotCount > 0)
3606 {
3607 instGen(INS_movsp);
3608 nonGcSlotCount--;
3609 }
3610 }
3611 else
3612 {
3613 // Otherwise, we can save code-size and improve CQ by emitting
3614 // rep movsp (alias for movsd/movsq for x86/x64)
3615 assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
3616
3617 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
3618 instGen(INS_r_movsp);
3619 }
3620 }
3621 break;
3622 default:
                    // We have a GC pointer; call the byref assignment write barrier helper.
3624 genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
3625 gcPtrCount--;
3626 i++;
3627 }
3628 }
3629
3630 assert(gcPtrCount == 0);
3631 }
3632
3633 // Clear the gcInfo for RSI and RDI.
3634 // While we normally update GC info prior to the last instruction that uses them,
3635 // these actually live into the helper call.
3636 gcInfo.gcMarkRegSetNpt(RBM_RSI);
3637 gcInfo.gcMarkRegSetNpt(RBM_RDI);
3638}
3639
// Generate code for a CpBlk node by means of the VM memcpy helper call.
// Preconditions:
// a) The size argument of the CpBlk is not an integer constant, or
// b) The size argument is a constant but is at least CPBLK_MOVS_LIMIT bytes.
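//
// Illustrative emitted sequence (register names assume the Windows x64 ABI, where
// REG_ARG_0/1/2 are RCX/RDX/R8):
//     mov rcx, <dstAddr>
//     mov rdx, <srcAddr>
//     mov r8,  <size>
//     call CORINFO_HELP_MEMCPY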
3644void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode)
3645{
3646#ifdef _TARGET_AMD64_
3647 // Make sure we got the arguments of the cpblk operation in the right registers
3648 unsigned blockSize = cpBlkNode->Size();
3649 GenTree* dstAddr = cpBlkNode->Addr();
3650 GenTree* source = cpBlkNode->Data();
3651 GenTree* srcAddr = nullptr;
3652
3653 // Size goes in arg2
3654 if (blockSize != 0)
3655 {
3656 assert(blockSize >= CPBLK_MOVS_LIMIT);
3657 assert((cpBlkNode->gtRsvdRegs & RBM_ARG_2) != 0);
3658 }
3659 else
3660 {
3661 noway_assert(cpBlkNode->gtOper == GT_STORE_DYN_BLK);
3662 }
3663
3664 // Source address goes in arg1
3665 if (source->gtOper == GT_IND)
3666 {
3667 srcAddr = source->gtGetOp1();
3668 assert(srcAddr->isUsedFromReg());
3669 }
3670 else
3671 {
3672 noway_assert(source->IsLocal());
3673 assert((cpBlkNode->gtRsvdRegs & RBM_ARG_1) != 0);
3674 inst_RV_TT(INS_lea, REG_ARG_1, source, 0, EA_BYREF);
3675 }
3676
3677 genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
3678
3679 genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
3680#else // !_TARGET_AMD64_
3681 noway_assert(false && "Helper call for CpBlk is not needed.");
3682#endif // !_TARGET_AMD64_
3683}
3684
// Generate code to do a switch statement based on a table of ip-relative offsets.
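// The emitted sequence looks roughly like this (illustrative):
//     mov  baseReg, dword ptr [baseReg + 4*idxReg]   ; load the ip-relative offset from the jump table
//     lea  tmpReg, [fgFirstBB]                       ; absolute address of the method's first block
//     add  baseReg, tmpReg
//     jmp  baseReg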
3686void CodeGen::genTableBasedSwitch(GenTree* treeNode)
3687{
3688 genConsumeOperands(treeNode->AsOp());
3689 regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
3690 regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
3691
3692 regNumber tmpReg = treeNode->GetSingleTempReg();
3693
3694 // load the ip-relative offset (which is relative to start of fgFirstBB)
3695 getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);
3696
3697 // add it to the absolute address of fgFirstBB
3698 compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
3699 getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg);
3700 getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg);
3701 // jmp baseReg
3702 getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg);
3703}
3704
// Emits the switch-case jump table and an instruction to get the address of its first element.
3706void CodeGen::genJumpTable(GenTree* treeNode)
3707{
3708 noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
3709 assert(treeNode->OperGet() == GT_JMPTABLE);
3710
3711 unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
3712 BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
3713 unsigned jmpTabOffs;
3714 unsigned jmpTabBase;
3715
3716 jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);
3717
3718 jmpTabOffs = 0;
3719
3720 JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);
3721
3722 for (unsigned i = 0; i < jumpCount; i++)
3723 {
3724 BasicBlock* target = *jumpTable++;
3725 noway_assert(target->bbFlags & BBF_JMP_TARGET);
3726
3727 JITDUMP(" DD L_M%03u_" FMT_BB "\n", Compiler::s_compMethodsCount, target->bbNum);
3728
3729 getEmitter()->emitDataGenData(i, target);
    }
3731
3732 getEmitter()->emitDataGenEnd();
3733
3734 // Access to inline data is 'abstracted' by a special type of static member
3735 // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
3736 // to constant data, not a real static field.
3737 getEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->gtRegNum,
3738 compiler->eeFindJitDataOffs(jmpTabBase), 0);
3739 genProduceReg(treeNode);
3740}
3741
3742//------------------------------------------------------------------------
3743// genCodeForLockAdd: Generate code for a GT_LOCKADD node
3744//
3745// Arguments:
3746// node - the GT_LOCKADD node
3747//
3748void CodeGen::genCodeForLockAdd(GenTreeOp* node)
3749{
3750 assert(node->OperIs(GT_LOCKADD));
3751
3752 GenTree* addr = node->gtGetOp1();
3753 GenTree* data = node->gtGetOp2();
3754 emitAttr size = emitActualTypeSize(data->TypeGet());
3755
3756 assert(addr->isUsedFromReg());
3757 assert(data->isUsedFromReg() || data->isContainedIntOrIImmed());
3758 assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
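    // Depending on whether 'data' is a contained immediate or a register, this emits (illustrative):
    //     lock add dword ptr [addrReg], imm32
    // or
    //     lock add dword ptr [addrReg], dataReg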
3759
3760 genConsumeOperands(node);
3761 instGen(INS_lock);
3762
3763 if (data->isContainedIntOrIImmed())
3764 {
3765 int imm = static_cast<int>(data->AsIntCon()->IconValue());
3766 assert(imm == data->AsIntCon()->IconValue());
3767 getEmitter()->emitIns_I_AR(INS_add, size, imm, addr->gtRegNum, 0);
3768 }
3769 else
3770 {
3771 getEmitter()->emitIns_AR_R(INS_add, size, data->gtRegNum, addr->gtRegNum, 0);
3772 }
3773}
3774
3775//------------------------------------------------------------------------
3776// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node.
3777//
3778// Arguments:
3779// node - the GT_XADD/XCHG node
3780//
3781void CodeGen::genLockedInstructions(GenTreeOp* node)
3782{
3783 assert(node->OperIs(GT_XADD, GT_XCHG));
3784
3785 GenTree* addr = node->gtGetOp1();
3786 GenTree* data = node->gtGetOp2();
3787 emitAttr size = emitTypeSize(node->TypeGet());
3788
3789 assert(addr->isUsedFromReg());
3790 assert(data->isUsedFromReg());
3791 assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
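    // Illustrative emitted forms (after moving 'data' into the destination register if needed):
    //     lock xadd [addrReg], targetReg   ; GT_XADD
    //     xchg      [addrReg], targetReg   ; GT_XCHG (the lock prefix is implicit)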
3792
3793 genConsumeOperands(node);
3794
3795 if (node->gtRegNum != data->gtRegNum)
3796 {
3797 // If the destination register is different from the data register then we need
3798 // to first move the data to the target register. Make sure we don't overwrite
3799 // the address, the register allocator should have taken care of this.
3800 assert(node->gtRegNum != addr->gtRegNum);
3801 getEmitter()->emitIns_R_R(INS_mov, size, node->gtRegNum, data->gtRegNum);
3802 }
3803
3804 instruction ins = node->OperIs(GT_XADD) ? INS_xadd : INS_xchg;
3805
3806 // XCHG has an implied lock prefix when the first operand is a memory operand.
3807 if (ins != INS_xchg)
3808 {
3809 instGen(INS_lock);
3810 }
3811
3812 getEmitter()->emitIns_AR_R(ins, size, node->gtRegNum, addr->gtRegNum, 0);
3813 genProduceReg(node);
3814}
3815
3816//------------------------------------------------------------------------
3817// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node.
3818//
3819// Arguments:
3820// tree - the GT_CMPXCHG node
3821//
3822void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree)
3823{
3824 assert(tree->OperIs(GT_CMPXCHG));
3825
3826 var_types targetType = tree->TypeGet();
3827 regNumber targetReg = tree->gtRegNum;
3828
3829 GenTree* location = tree->gtOpLocation; // arg1
3830 GenTree* value = tree->gtOpValue; // arg2
3831 GenTree* comparand = tree->gtOpComparand; // arg3
3832
3833 assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
3834 assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
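    // Overall shape of the emitted code (illustrative):
    //     mov  rax, comparandReg            ; if the comparand isn't already in RAX
    //     lock cmpxchg [locationReg], valueReg
    //     mov  targetReg, rax               ; if the result is wanted in a different register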
3835
3836 genConsumeReg(location);
3837 genConsumeReg(value);
3838 genConsumeReg(comparand);
3839
3840 // comparand goes to RAX;
    // Note that we must issue this move after the genConsumeReg() calls above, in case any of them
    // have a GT_COPY from RAX.
3843 if (comparand->gtRegNum != REG_RAX)
3844 {
3845 inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
3846 }
3847
3848 // location is Rm
3849 instGen(INS_lock);
3850
3851 getEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
3852
3853 // Result is in RAX
3854 if (targetReg != REG_RAX)
3855 {
3856 inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
3857 }
3858
3859 genProduceReg(tree);
3860}
3861
3862// generate code for BoundsCheck nodes
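// For example, when the index is in a register and the array length is a memory operand, this
// emits (illustrative):
//     cmp idxReg, dword ptr [arrLen]
//     jae <range check failure block>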
3863void CodeGen::genRangeCheck(GenTree* oper)
3864{
3865 noway_assert(oper->OperIsBoundsCheck());
3866 GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();
3867
3868 GenTree* arrIndex = bndsChk->gtIndex;
3869 GenTree* arrLen = bndsChk->gtArrLen;
3870 GenTree* arrRef = nullptr;
3871 int lenOffset = 0;
3872
3873 GenTree * src1, *src2;
3874 emitJumpKind jmpKind;
3875
3876 genConsumeRegs(arrIndex);
3877 genConsumeRegs(arrLen);
3878
3879 if (arrIndex->isContainedIntOrIImmed())
3880 {
3881 // arrIndex is a contained constant. In this case
3882 // we will generate one of the following
3883 // cmp [mem], immed (if arrLen is a memory op)
3884 // cmp reg, immed (if arrLen is in a reg)
3885 //
        // That is, arrLen cannot be a contained immediate.
3887 assert(!arrLen->isContainedIntOrIImmed());
3888
3889 src1 = arrLen;
3890 src2 = arrIndex;
3891 jmpKind = EJ_jbe;
3892 }
3893 else
3894 {
3895 // arrIndex could either be a contained memory op or a reg
3896 // In this case we will generate one of the following
3897 // cmp [mem], immed (if arrLen is a constant)
3898 // cmp [mem], reg (if arrLen is in a reg)
3899 // cmp reg, immed (if arrIndex is in a reg)
        //       cmp reg1, reg2      (if arrIndex is in reg1)
        //       cmp reg, [mem]      (if arrLen is a memory op)
        //
        // That is, only one of arrIndex or arrLen can be a memory op.
3904 assert(!arrIndex->isUsedFromMemory() || !arrLen->isUsedFromMemory());
3905
3906 src1 = arrIndex;
3907 src2 = arrLen;
3908 jmpKind = EJ_jae;
3909 }
3910
3911 var_types bndsChkType = src2->TypeGet();
3912#if DEBUG
3913 // Bounds checks can only be 32 or 64 bit sized comparisons.
3914 assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG);
3915
    // The type of the bounds check should always be wide enough to compare against the index.
3917 assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet()));
3918#endif // DEBUG
3919
3920 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(bndsChkType), src1, src2);
3921 genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB);
3922}
3923
3924//---------------------------------------------------------------------
3925// genCodeForPhysReg - generate code for a GT_PHYSREG node
3926//
3927// Arguments
3928// tree - the GT_PHYSREG node
3929//
3930// Return value:
3931// None
3932//
3933void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree)
3934{
3935 assert(tree->OperIs(GT_PHYSREG));
3936
3937 var_types targetType = tree->TypeGet();
3938 regNumber targetReg = tree->gtRegNum;
3939
3940 if (targetReg != tree->gtSrcReg)
3941 {
3942 inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType);
3943 genTransferRegGCState(targetReg, tree->gtSrcReg);
3944 }
3945
3946 genProduceReg(tree);
3947}
3948
3949//---------------------------------------------------------------------
3950// genCodeForNullCheck - generate code for a GT_NULLCHECK node
3951//
3952// Arguments
3953// tree - the GT_NULLCHECK node
3954//
3955// Return value:
3956// None
3957//
3958void CodeGen::genCodeForNullCheck(GenTreeOp* tree)
3959{
3960 assert(tree->OperIs(GT_NULLCHECK));
3961
3962 assert(tree->gtOp1->isUsedFromReg());
3963 regNumber reg = genConsumeReg(tree->gtOp1);
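    // Comparing the register against itself through memory forces a read of [reg] and faults if
    // 'reg' is null, e.g. (illustrative): cmp dword ptr [rax], eax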
3964 getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
3965}
3966
3967//------------------------------------------------------------------------
3968// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
3969// lower bound for the given dimension.
3970//
3971// Arguments:
3972// elemType - the element type of the array
3973// rank - the rank of the array
3974// dimension - the dimension for which the lower bound offset will be returned.
3975//
3976// Return Value:
3977// The offset.
3978
3979unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension)
3980{
3981 // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
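    // For example (illustrative), for a rank-2 array the per-dimension lengths are laid out first,
    // so the lower bound of dimension 0 lives at eeGetArrayDataOffset(elemType) + 4 * (0 + 2).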
3982 return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank);
3983}
3984
3985//------------------------------------------------------------------------
// genOffsetOfMDArrayDimensionSize: Returns the offset from the Array object to the
//    length (dimension size) for the given dimension.
//
// Arguments:
//    elemType  - the element type of the array
//    rank      - the rank of the array
//    dimension - the dimension for which the length offset will be returned.
3993//
3994// Return Value:
3995// The offset.
3996
3997unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension)
3998{
3999 // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
4000 return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension;
4001}
4002
4003//------------------------------------------------------------------------
4004// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference,
4005// producing the effective index by subtracting the lower bound.
4006//
4007// Arguments:
4008// arrIndex - the node for which we're generating code
4009//
4010// Return Value:
4011// None.
4012//
4013
4014void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex)
4015{
4016 GenTree* arrObj = arrIndex->ArrObj();
4017 GenTree* indexNode = arrIndex->IndexExpr();
4018
4019 regNumber arrReg = genConsumeReg(arrObj);
4020 regNumber indexReg = genConsumeReg(indexNode);
4021 regNumber tgtReg = arrIndex->gtRegNum;
4022
4023 unsigned dim = arrIndex->gtCurrDim;
4024 unsigned rank = arrIndex->gtArrRank;
4025 var_types elemType = arrIndex->gtArrElemType;
4026
4027 noway_assert(tgtReg != REG_NA);
4028
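    // Overall shape of the emitted code (illustrative):
    //     mov tgtReg, indexReg                               ; if not already there
    //     sub tgtReg, dword ptr [arrReg + lowerBoundOffset]
    //     cmp tgtReg, dword ptr [arrReg + lengthOffset]
    //     jae <range check failure block>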
4029 // Subtract the lower bound for this dimension.
4030 // TODO-XArch-CQ: make this contained if it's an immediate that fits.
4031 if (tgtReg != indexReg)
4032 {
4033 inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet());
4034 }
4035 getEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
4036 genOffsetOfMDArrayLowerBound(elemType, rank, dim));
4037 getEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
4038 genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
4039 genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL);
4040
4041 genProduceReg(arrIndex);
4042}
4043
4044//------------------------------------------------------------------------
4045// genCodeForArrOffset: Generates code to compute the flattened array offset for
4046// one dimension of an array reference:
4047// result = (prevDimOffset * dimSize) + effectiveIndex
4048// where dimSize is obtained from the arrObj operand
4049//
4050// Arguments:
4051// arrOffset - the node for which we're generating code
4052//
4053// Return Value:
4054// None.
4055//
4056// Notes:
4057// dimSize and effectiveIndex are always non-negative, the former by design,
4058// and the latter because it has been normalized to be zero-based.
4059
4060void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset)
4061{
4062 GenTree* offsetNode = arrOffset->gtOffset;
4063 GenTree* indexNode = arrOffset->gtIndex;
4064 GenTree* arrObj = arrOffset->gtArrObj;
4065
4066 regNumber tgtReg = arrOffset->gtRegNum;
4067 assert(tgtReg != REG_NA);
4068
4069 unsigned dim = arrOffset->gtCurrDim;
4070 unsigned rank = arrOffset->gtArrRank;
4071 var_types elemType = arrOffset->gtArrElemType;
4072
4073 // First, consume the operands in the correct order.
4074 regNumber offsetReg = REG_NA;
4075 regNumber tmpReg = REG_NA;
4076 if (!offsetNode->IsIntegralConst(0))
4077 {
4078 offsetReg = genConsumeReg(offsetNode);
4079
4080 // We will use a temp register for the offset*scale+effectiveIndex computation.
4081 tmpReg = arrOffset->GetSingleTempReg();
4082 }
4083 else
4084 {
4085 assert(offsetNode->isContained());
4086 }
4087 regNumber indexReg = genConsumeReg(indexNode);
4088 // Although arrReg may not be used in the constant-index case, if we have generated
4089 // the value into a register, we must consume it, otherwise we will fail to end the
4090 // live range of the gc ptr.
4091 // TODO-CQ: Currently arrObj will always have a register allocated to it.
4092 // We could avoid allocating a register for it, which would be of value if the arrObj
4093 // is an on-stack lclVar.
4094 regNumber arrReg = REG_NA;
4095 if (arrObj->gtHasReg())
4096 {
4097 arrReg = genConsumeReg(arrObj);
4098 }
4099
4100 if (!offsetNode->IsIntegralConst(0))
4101 {
4102 assert(tmpReg != REG_NA);
4103 assert(arrReg != REG_NA);
4104
4105 // Evaluate tgtReg = offsetReg*dim_size + indexReg.
4106 // tmpReg is used to load dim_size and the result of the multiplication.
4107 // Note that dim_size will never be negative.
4108
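        // Illustrative emitted sequence:
        //     mov  tmpReg, dword ptr [arrReg + dimSizeOffset]
        //     imul tmpReg, offsetReg
        //     mov  tgtReg, indexReg        ; if needed
        //     add  tgtReg, tmpReg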
4109 getEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg,
4110 genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
4111 inst_RV_RV(INS_imul, tmpReg, offsetReg);
4112
4113 if (tmpReg == tgtReg)
4114 {
4115 inst_RV_RV(INS_add, tmpReg, indexReg);
4116 }
4117 else
4118 {
4119 if (indexReg != tgtReg)
4120 {
4121 inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL);
4122 }
4123 inst_RV_RV(INS_add, tgtReg, tmpReg);
4124 }
4125 }
4126 else
4127 {
4128 if (indexReg != tgtReg)
4129 {
4130 inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT);
4131 }
4132 }
4133 genProduceReg(arrOffset);
4134}
4135
4136instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
4137{
4138 instruction ins;
4139
    // Operations on SIMD vectors shouldn't come down this path.
4141 assert(!varTypeIsSIMD(type));
4142 if (varTypeIsFloating(type))
4143 {
4144 return ins_MathOp(oper, type);
4145 }
4146
4147 switch (oper)
4148 {
4149 case GT_ADD:
4150 ins = INS_add;
4151 break;
4152 case GT_AND:
4153 ins = INS_and;
4154 break;
4155 case GT_LSH:
4156 ins = INS_shl;
4157 break;
4158 case GT_MUL:
4159 ins = INS_imul;
4160 break;
4161 case GT_NEG:
4162 ins = INS_neg;
4163 break;
4164 case GT_NOT:
4165 ins = INS_not;
4166 break;
4167 case GT_OR:
4168 ins = INS_or;
4169 break;
4170 case GT_ROL:
4171 ins = INS_rol;
4172 break;
4173 case GT_ROR:
4174 ins = INS_ror;
4175 break;
4176 case GT_RSH:
4177 ins = INS_sar;
4178 break;
4179 case GT_RSZ:
4180 ins = INS_shr;
4181 break;
4182 case GT_SUB:
4183 ins = INS_sub;
4184 break;
4185 case GT_XOR:
4186 ins = INS_xor;
4187 break;
4188#if !defined(_TARGET_64BIT_)
4189 case GT_ADD_LO:
4190 ins = INS_add;
4191 break;
4192 case GT_ADD_HI:
4193 ins = INS_adc;
4194 break;
4195 case GT_SUB_LO:
4196 ins = INS_sub;
4197 break;
4198 case GT_SUB_HI:
4199 ins = INS_sbb;
4200 break;
4201 case GT_LSH_HI:
4202 ins = INS_shld;
4203 break;
4204 case GT_RSH_LO:
4205 ins = INS_shrd;
4206 break;
4207#endif // !defined(_TARGET_64BIT_)
4208 default:
4209 unreached();
4210 break;
4211 }
4212 return ins;
4213}
4214
4215//------------------------------------------------------------------------
4216// genCodeForShift: Generates the code sequence for a GenTree node that
4217// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror).
4218//
4219// Arguments:
4220// tree - the bit shift node (that specifies the type of bit shift to perform).
4221//
4222// Assumptions:
4223// a) All GenTrees are register allocated.
4224// b) The shift-by-amount in tree->gtOp.gtOp2 is either a contained constant or
4225// it's a register-allocated expression. If it is in a register that is
4226// not RCX, it will be moved to RCX (so RCX better not be in use!).
4227//
4228void CodeGen::genCodeForShift(GenTree* tree)
4229{
4230 // Only the non-RMW case here.
4231 assert(tree->OperIsShiftOrRotate());
4232 assert(tree->gtOp.gtOp1->isUsedFromReg());
4233 assert(tree->gtRegNum != REG_NA);
4234
4235 genConsumeOperands(tree->AsOp());
4236
4237 var_types targetType = tree->TypeGet();
4238 instruction ins = genGetInsForOper(tree->OperGet(), targetType);
4239
4240 GenTree* operand = tree->gtGetOp1();
4241 regNumber operandReg = operand->gtRegNum;
4242
4243 GenTree* shiftBy = tree->gtGetOp2();
4244
4245 if (shiftBy->isContainedIntOrIImmed())
4246 {
4247 // First, move the operand to the destination register and
4248 // later on perform the shift in-place.
4249 // (LSRA will try to avoid this situation through preferencing.)
4250 if (tree->gtRegNum != operandReg)
4251 {
4252 inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
4253 }
4254
4255 int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
4256 inst_RV_SH(ins, emitTypeSize(tree), tree->gtRegNum, shiftByValue);
4257 }
4258 else
4259 {
4260 // We must have the number of bits to shift stored in ECX, since we constrained this node to
4261 // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
4262 // register destination requirement.
4263 genCopyRegIfNeeded(shiftBy, REG_RCX);
4264
4265 // The operand to be shifted must not be in ECX
4266 noway_assert(operandReg != REG_RCX);
4267
4268 if (tree->gtRegNum != operandReg)
4269 {
4270 inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
4271 }
4272 inst_RV_CL(ins, tree->gtRegNum, targetType);
4273 }
4274
4275 genProduceReg(tree);
4276}
4277
4278#ifdef _TARGET_X86_
4279//------------------------------------------------------------------------
4280// genCodeForShiftLong: Generates the code sequence for a GenTree node that
4281// represents a three operand bit shift or rotate operation (<<Hi, >>Lo).
4282//
4283// Arguments:
4284// tree - the bit shift node (that specifies the type of bit shift to perform).
4285//
4286// Assumptions:
4287// a) All GenTrees are register allocated.
4288// b) The shift-by-amount in tree->gtOp.gtOp2 is a contained constant
4289//
4290// TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't
4291// need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to
4292// targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as
4293// contained memory-op. Even if not a memory-op, we could mark it as reg-optional.
4294//
4295void CodeGen::genCodeForShiftLong(GenTree* tree)
4296{
4297 // Only the non-RMW case here.
4298 genTreeOps oper = tree->OperGet();
4299 assert(oper == GT_LSH_HI || oper == GT_RSH_LO);
4300
4301 GenTree* operand = tree->gtOp.gtOp1;
4302 assert(operand->OperGet() == GT_LONG);
4303 assert(operand->gtOp.gtOp1->isUsedFromReg());
4304 assert(operand->gtOp.gtOp2->isUsedFromReg());
4305
4306 GenTree* operandLo = operand->gtGetOp1();
4307 GenTree* operandHi = operand->gtGetOp2();
4308
4309 regNumber regLo = operandLo->gtRegNum;
4310 regNumber regHi = operandHi->gtRegNum;
4311
4312 genConsumeOperands(tree->AsOp());
4313
4314 var_types targetType = tree->TypeGet();
4315 instruction ins = genGetInsForOper(oper, targetType);
4316
4317 GenTree* shiftBy = tree->gtGetOp2();
4318
4319 assert(shiftBy->isContainedIntOrIImmed());
4320
4321 unsigned int count = shiftBy->AsIntConCommon()->IconValue();
4322
4323 regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo;
4324
4325 if (regResult != tree->gtRegNum)
4326 {
4327 inst_RV_RV(INS_mov, tree->gtRegNum, regResult, targetType);
4328 }
4329
4330 if (oper == GT_LSH_HI)
4331 {
4332 inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regLo, count);
4333 }
4334 else
4335 {
4336 assert(oper == GT_RSH_LO);
4337 inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regHi, count);
4338 }
4339
4340 genProduceReg(tree);
4341}
4342#endif
4343
4344//------------------------------------------------------------------------
4345// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that
4346// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example:
4347// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) )
4348//
4349// Arguments:
4350// storeIndNode: the GT_STOREIND node.
4351//
4352void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd)
4353{
4354 GenTree* data = storeInd->Data();
4355 GenTree* addr = storeInd->Addr();
4356
4357 assert(data->OperIsShift() || data->OperIsRotate());
4358
4359 // This function only handles the RMW case.
4360 assert(data->gtOp.gtOp1->isUsedFromMemory());
4361 assert(data->gtOp.gtOp1->isIndir());
4362 assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd));
4363 assert(data->gtRegNum == REG_NA);
4364
4365 var_types targetType = data->TypeGet();
4366 genTreeOps oper = data->OperGet();
4367 instruction ins = genGetInsForOper(oper, targetType);
4368 emitAttr attr = EA_ATTR(genTypeSize(targetType));
4369
4370 GenTree* shiftBy = data->gtOp.gtOp2;
4371 if (shiftBy->isContainedIntOrIImmed())
4372 {
4373 int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
4374 ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
4375 if (shiftByValue == 1)
4376 {
4377 // There is no source in this case, as the shift by count is embedded in the instruction opcode itself.
4378 getEmitter()->emitInsRMW(ins, attr, storeInd);
4379 }
4380 else
4381 {
4382 getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy);
4383 }
4384 }
4385 else
4386 {
4387 // We must have the number of bits to shift stored in ECX, since we constrained this node to
4388 // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
4389 // register destination requirement.
        genCopyRegIfNeeded(shiftBy, REG_RCX);
4392
4393 // The shiftBy operand is implicit, so call the unary version of emitInsRMW.
4394 getEmitter()->emitInsRMW(ins, attr, storeInd);
4395 }
4396}
4397
4398//------------------------------------------------------------------------
4399// genCodeForLclAddr: Generates the code for GT_LCL_FLD_ADDR/GT_LCL_VAR_ADDR.
4400//
4401// Arguments:
4402// tree - the node.
4403//
4404void CodeGen::genCodeForLclAddr(GenTree* tree)
4405{
4406 assert(tree->OperIs(GT_LCL_FLD_ADDR, GT_LCL_VAR_ADDR));
4407
4408 var_types targetType = tree->TypeGet();
4409 regNumber targetReg = tree->gtRegNum;
4410
4411 // Address of a local var.
4412 noway_assert(targetType == TYP_BYREF);
4413
4414 inst_RV_TT(INS_lea, targetReg, tree, 0, EA_BYREF);
4415 genProduceReg(tree);
4416}
4417
4418//------------------------------------------------------------------------
4419// genCodeForLclFld: Produce code for a GT_LCL_FLD node.
4420//
4421// Arguments:
4422// tree - the GT_LCL_FLD node
4423//
4424void CodeGen::genCodeForLclFld(GenTreeLclFld* tree)
4425{
4426 assert(tree->OperIs(GT_LCL_FLD));
4427
4428 var_types targetType = tree->TypeGet();
4429 regNumber targetReg = tree->gtRegNum;
4430
4431 noway_assert(targetReg != REG_NA);
4432
4433#ifdef FEATURE_SIMD
4434 // Loading of TYP_SIMD12 (i.e. Vector3) field
4435 if (targetType == TYP_SIMD12)
4436 {
4437 genLoadLclTypeSIMD12(tree);
4438 return;
4439 }
4440#endif
4441
4442 noway_assert(targetType != TYP_STRUCT);
4443
4444 emitAttr size = emitTypeSize(targetType);
4445 unsigned offs = tree->gtLclOffs;
4446 unsigned varNum = tree->gtLclNum;
4447 assert(varNum < compiler->lvaCount);
4448
4449 getEmitter()->emitIns_R_S(ins_Load(targetType), size, targetReg, varNum, offs);
4450
4451 genProduceReg(tree);
4452}
4453
4454//------------------------------------------------------------------------
4455// genCodeForLclVar: Produce code for a GT_LCL_VAR node.
4456//
4457// Arguments:
4458// tree - the GT_LCL_VAR node
4459//
4460void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
4461{
4462 assert(tree->OperIs(GT_LCL_VAR));
4463
4464 // lcl_vars are not defs
4465 assert((tree->gtFlags & GTF_VAR_DEF) == 0);
4466
4467 bool isRegCandidate = compiler->lvaTable[tree->gtLclNum].lvIsRegCandidate();
4468
4469 // If this is a register candidate that has been spilled, genConsumeReg() will
4470 // reload it at the point of use. Otherwise, if it's not in a register, we load it here.
4471
4472 if (!isRegCandidate && !(tree->gtFlags & GTF_SPILLED))
4473 {
4474#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
4475 // Loading of TYP_SIMD12 (i.e. Vector3) variable
4476 if (tree->TypeGet() == TYP_SIMD12)
4477 {
4478 genLoadLclTypeSIMD12(tree);
4479 return;
4480 }
4481#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
4482
4483 getEmitter()->emitIns_R_S(ins_Load(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(tree->gtLclNum)),
4484 emitTypeSize(tree), tree->gtRegNum, tree->gtLclNum, 0);
4485 genProduceReg(tree);
4486 }
4487}
4488
4489//------------------------------------------------------------------------
4490// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node.
4491//
4492// Arguments:
4493// tree - the GT_STORE_LCL_FLD node
4494//
4495void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
4496{
4497 assert(tree->OperIs(GT_STORE_LCL_FLD));
4498
4499 var_types targetType = tree->TypeGet();
4500 noway_assert(targetType != TYP_STRUCT);
4501 assert(!varTypeIsFloating(targetType) || (targetType == tree->gtOp1->TypeGet()));
4502
4503#ifdef FEATURE_SIMD
4504 // storing of TYP_SIMD12 (i.e. Vector3) field
4505 if (tree->TypeGet() == TYP_SIMD12)
4506 {
4507 genStoreLclTypeSIMD12(tree);
4508 return;
4509 }
4510#endif // FEATURE_SIMD
4511
4512 GenTree* op1 = tree->gtGetOp1();
4513 genConsumeRegs(op1);
4514 getEmitter()->emitInsBinary(ins_Store(targetType), emitTypeSize(tree), tree, op1);
4515
4516 genUpdateLife(tree);
4517}
4518
4519//------------------------------------------------------------------------
4520// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node.
4521//
4522// Arguments:
4523// tree - the GT_STORE_LCL_VAR node
4524//
4525void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* tree)
4526{
4527 assert(tree->OperIs(GT_STORE_LCL_VAR));
4528
4529 var_types targetType = tree->TypeGet();
4530 regNumber targetReg = tree->gtRegNum;
4531 emitter* emit = getEmitter();
4532
4533 GenTree* op1 = tree->gtGetOp1();
4534
4535 // var = call, where call returns a multi-reg return value
4536 // case is handled separately.
4537 if (op1->gtSkipReloadOrCopy()->IsMultiRegCall())
4538 {
4539 genMultiRegCallStoreToLocal(tree);
4540 }
4541 else
4542 {
4543 noway_assert(targetType != TYP_STRUCT);
4544 assert(!varTypeIsFloating(targetType) || (targetType == op1->TypeGet()));
4545
4546 unsigned lclNum = tree->gtLclNum;
4547 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
4548
4549 // Ensure that lclVar nodes are typed correctly.
4550 assert(!varDsc->lvNormalizeOnStore() || (targetType == genActualType(varDsc->TypeGet())));
4551
4552#if !defined(_TARGET_64BIT_)
4553 if (targetType == TYP_LONG)
4554 {
4555 genStoreLongLclVar(tree);
4556 return;
4557 }
4558#endif // !defined(_TARGET_64BIT_)
4559
4560#ifdef FEATURE_SIMD
4561 // storing of TYP_SIMD12 (i.e. Vector3) field
4562 if (targetType == TYP_SIMD12)
4563 {
4564 genStoreLclTypeSIMD12(tree);
4565 return;
4566 }
4567
4568 if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
4569 {
4570 // This is only possible for a zero-init.
4571 noway_assert(op1->IsIntegralConst(0));
4572 genSIMDZero(targetType, varDsc->lvBaseType, targetReg);
4573 genProduceReg(tree);
4574 return;
4575 }
4576#endif // FEATURE_SIMD
4577
4578 genConsumeRegs(op1);
4579
4580 if (targetReg == REG_NA)
4581 {
4582 // stack store
4583 emit->emitInsStoreLcl(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)),
4584 emitTypeSize(targetType), tree);
4585 varDsc->lvRegNum = REG_STK;
4586 }
4587 else
4588 {
4589 // Look for the case where we have a constant zero which we've marked for reuse,
4590 // but which isn't actually in the register we want. In that case, it's better to create
4591 // zero in the target register, because an xor is smaller than a copy. Note that we could
4592 // potentially handle this in the register allocator, but we can't always catch it there
4593 // because the target may not have a register allocated for it yet.
4594 if (op1->isUsedFromReg() && (op1->gtRegNum != targetReg) && (op1->IsIntegralConst(0) || op1->IsFPZero()))
4595 {
4596 op1->gtRegNum = REG_NA;
4597 op1->ResetReuseRegVal();
4598 op1->SetContained();
4599 }
4600
4601 if (!op1->isUsedFromReg())
4602 {
                // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register
                // must be a constant. However, in the future we might want to support an operand used from
                // memory. This is a bit tricky because we have to decide that it can be used from memory
                // before register allocation, and this would be a case where, once that's done, we need to
                // mark that node as always requiring a register - which we always assume now anyway, but
                // once we "optimize" that we'll have to take cases like this into account.
4610 assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
4611 genSetRegToConst(targetReg, targetType, op1);
4612 }
4613 else if (op1->gtRegNum != targetReg)
4614 {
4615 assert(op1->gtRegNum != REG_NA);
4616 emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(tree), tree, op1);
4617 }
4618 }
4619 }
4620
4621 if (targetReg != REG_NA)
4622 {
4623 genProduceReg(tree);
4624 }
4625}
4626
4627//------------------------------------------------------------------------
4628// genCodeForIndexAddr: Produce code for a GT_INDEX_ADDR node.
4629//
4630// Arguments:
//    node - the GT_INDEX_ADDR node
4632//
4633void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
4634{
4635 GenTree* const base = node->Arr();
4636 GenTree* const index = node->Index();
4637
4638 genConsumeReg(base);
4639 genConsumeReg(index);
4640
4641 // NOTE: `genConsumeReg` marks the consumed register as not a GC pointer, as it assumes that the input registers
4642 // die at the first instruction generated by the node. This is not the case for `INDEX_ADDR`, however, as the
4643 // base register is multiply-used. As such, we need to mark the base register as containing a GC pointer until
4644 // we are finished generating the code for this node.
4645
4646 gcInfo.gcMarkRegPtrVal(base->gtRegNum, base->TypeGet());
4647 assert(!varTypeIsGC(index->TypeGet()));
4648
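    // Overall shape of the emitted code for a range-checked element of size 8 (illustrative):
    //     cmp idxReg, dword ptr [baseReg + lenOffset]
    //     jae <range check failure block>
    //     lea dstReg, [baseReg + 8*idxReg + elemOffset]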
4649 regNumber tmpReg = REG_NA;
4650
4651 // Generate the bounds check if necessary.
4652 if ((node->gtFlags & GTF_INX_RNGCHK) != 0)
4653 {
4654 // Create a GT_IND(GT_LEA)) tree for the array length access.
4655 GenTreeAddrMode arrLenAddr(base->TypeGet(), base, nullptr, 0, node->gtLenOffset);
4656 arrLenAddr.gtRegNum = REG_NA;
4657 arrLenAddr.SetContained();
4658
4659 GenTreeIndir arrLen = indirForm(TYP_INT, &arrLenAddr);
4660
4661#ifdef _TARGET_64BIT_
4662 // The CLI Spec allows an array to be indexed by either an int32 or a native int. In the case that the index
4663 // is a native int on a 64-bit platform, we will need to widen the array length and the compare.
4664 if (index->TypeGet() == TYP_I_IMPL)
4665 {
4666 // Load the array length into a register.
4667 tmpReg = node->GetSingleTempReg();
4668 arrLen.gtRegNum = tmpReg;
4669 arrLen.ClearContained();
4670 getEmitter()->emitInsLoadInd(ins_Load(TYP_INT), EA_4BYTE, arrLen.gtRegNum, &arrLen);
4671 }
4672 else
4673#endif
4674 {
4675 assert(varTypeIsIntegral(index->TypeGet()));
4676
4677 arrLen.gtRegNum = REG_NA;
4678 arrLen.SetContained();
4679 }
4680
4681 // Generate the range check.
4682 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_I_IMPL), index, &arrLen);
4683 genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL, node->gtIndRngFailBB);
4684 }
4685
4686 // Compute the address of the array element.
4687 switch (node->gtElemSize)
4688 {
4689 case 1:
4690 case 2:
4691 case 4:
4692 case 8:
4693 getEmitter()->emitIns_R_ARX(INS_lea, emitTypeSize(node), node->gtRegNum, base->gtRegNum, index->gtRegNum,
4694 node->gtElemSize, static_cast<int>(node->gtElemOffset));
4695 break;
4696
4697 default:
4698 {
4699 // Multiply the index by the element size.
4700 //
4701 // TODO-CQ: this should really just use `imul index, index, #gtElemSize`
4702 tmpReg = (tmpReg == REG_NA) ? node->GetSingleTempReg() : tmpReg;
4703 CodeGen::genSetRegToIcon(tmpReg, (ssize_t)node->gtElemSize, TYP_INT);
4704 inst_RV_RV(INS_imul, tmpReg, index->gtRegNum);
4705 getEmitter()->emitIns_R_ARX(INS_lea, emitTypeSize(node), node->gtRegNum, base->gtRegNum, tmpReg, 1,
4706 static_cast<int>(node->gtElemOffset));
4707 break;
4708 }
4709 }
4710
4711 gcInfo.gcMarkRegSetNpt(base->gtGetRegMask());
4712
4713 genProduceReg(node);
4714}
4715
4716//------------------------------------------------------------------------
4717// genCodeForIndir: Produce code for a GT_IND node.
4718//
4719// Arguments:
4720// tree - the GT_IND node
4721//
4722void CodeGen::genCodeForIndir(GenTreeIndir* tree)
4723{
4724 assert(tree->OperIs(GT_IND));
4725
4726#ifdef FEATURE_SIMD
4727 // Handling of Vector3 type values loaded through indirection.
4728 if (tree->TypeGet() == TYP_SIMD12)
4729 {
4730 genLoadIndTypeSIMD12(tree);
4731 return;
4732 }
4733#endif // FEATURE_SIMD
4734
4735 var_types targetType = tree->TypeGet();
4736 emitter* emit = getEmitter();
4737
4738 GenTree* addr = tree->Addr();
4739 if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL))
4740 {
4741 noway_assert(EA_ATTR(genTypeSize(targetType)) == EA_PTRSIZE);
4742 emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, tree->gtRegNum, FLD_GLOBAL_FS,
4743 (int)addr->gtIntCon.gtIconVal);
4744 }
4745 else
4746 {
4747 genConsumeAddress(addr);
4748 emit->emitInsLoadInd(ins_Load(targetType), emitTypeSize(tree), tree->gtRegNum, tree);
4749 }
4750
4751 genProduceReg(tree);
4752}
4753
4754//------------------------------------------------------------------------
4755// genRegCopy: Produce code for a GT_COPY node.
4756//
4757// Arguments:
4758// tree - the GT_COPY node
4759//
4760// Notes:
//    This will copy the register(s) produced by this node's source, to
//    the register(s) allocated to this GT_COPY node.
//    It has some special handling for these cases:
4764// - when the source and target registers are in different register files
4765// (note that this is *not* a conversion).
4766// - when the source is a lclVar whose home location is being moved to a new
4767// register (rather than just being copied for temporary use).
4768//
4769void CodeGen::genRegCopy(GenTree* treeNode)
4770{
4771 assert(treeNode->OperGet() == GT_COPY);
4772 GenTree* op1 = treeNode->gtOp.gtOp1;
4773
4774 if (op1->IsMultiRegNode())
4775 {
4776 genConsumeReg(op1);
4777
4778 GenTreeCopyOrReload* copyTree = treeNode->AsCopyOrReload();
4779 unsigned regCount = treeNode->GetMultiRegCount();
4780
4781 for (unsigned i = 0; i < regCount; ++i)
4782 {
4783 var_types type = op1->GetRegTypeByIndex(i);
4784 regNumber fromReg = op1->GetRegByIndex(i);
4785 regNumber toReg = copyTree->GetRegNumByIdx(i);
4786
4787 // A Multi-reg GT_COPY node will have a valid reg only for those positions for which a corresponding
4788 // result reg of the multi-reg node needs to be copied.
4789 if (toReg != REG_NA)
4790 {
4791 assert(toReg != fromReg);
4792 inst_RV_RV(ins_Copy(type), toReg, fromReg, type);
4793 }
4794 }
4795 }
4796 else
4797 {
4798 var_types targetType = treeNode->TypeGet();
4799 regNumber targetReg = treeNode->gtRegNum;
4800 assert(targetReg != REG_NA);
4801
4802 // Check whether this node and the node from which we're copying the value have
4803 // different register types. This can happen if (currently iff) we have a SIMD
4804 // vector type that fits in an integer register, in which case it is passed as
4805 // an argument, or returned from a call, in an integer register and must be
4806 // copied if it's in an xmm register.
4807
4808 bool srcFltReg = (varTypeIsFloating(op1) || varTypeIsSIMD(op1));
4809 bool tgtFltReg = (varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode));
4810 if (srcFltReg != tgtFltReg)
4811 {
4812 instruction ins;
4813 regNumber fpReg;
4814 regNumber intReg;
4815 if (tgtFltReg)
4816 {
4817 ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
4818 fpReg = targetReg;
4819 intReg = op1->gtRegNum;
4820 }
4821 else
4822 {
4823 ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
4824 intReg = targetReg;
4825 fpReg = op1->gtRegNum;
4826 }
4827 inst_RV_RV(ins, fpReg, intReg, targetType);
4828 }
4829 else
4830 {
4831 inst_RV_RV(ins_Copy(targetType), targetReg, genConsumeReg(op1), targetType);
4832 }
4833
4834 if (op1->IsLocal())
4835 {
4836 // The lclVar will never be a def.
4837 // If it is a last use, the lclVar will be killed by genConsumeReg(), as usual, and genProduceReg will
4838 // appropriately set the gcInfo for the copied value.
4839 // If not, there are two cases we need to handle:
4840 // - If this is a TEMPORARY copy (indicated by the GTF_VAR_DEATH flag) the variable
4841 // will remain live in its original register.
4842 // genProduceReg() will appropriately set the gcInfo for the copied value,
4843 // and genConsumeReg will reset it.
4844 // - Otherwise, we need to update register info for the lclVar.
4845
4846 GenTreeLclVarCommon* lcl = op1->AsLclVarCommon();
4847 assert((lcl->gtFlags & GTF_VAR_DEF) == 0);
4848
4849 if ((lcl->gtFlags & GTF_VAR_DEATH) == 0 && (treeNode->gtFlags & GTF_VAR_DEATH) == 0)
4850 {
4851 LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
4852
4853 // If we didn't just spill it (in genConsumeReg, above), then update the register info
4854 if (varDsc->lvRegNum != REG_STK)
4855 {
4856 // The old location is dying
4857 genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(op1));
4858
4859 gcInfo.gcMarkRegSetNpt(genRegMask(op1->gtRegNum));
4860
4861 genUpdateVarReg(varDsc, treeNode);
4862
4863 // The new location is going live
4864 genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(treeNode));
4865 }
4866 }
4867 }
4868 }
4869
4870 genProduceReg(treeNode);
4871}
4872
4873//------------------------------------------------------------------------
4874// genCodeForStoreInd: Produce code for a GT_STOREIND node.
4875//
4876// Arguments:
4877// tree - the GT_STOREIND node
4878//
4879void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
4880{
4881 assert(tree->OperIs(GT_STOREIND));
4882
4883#ifdef FEATURE_SIMD
4884 // Storing Vector3 of size 12 bytes through indirection
4885 if (tree->TypeGet() == TYP_SIMD12)
4886 {
4887 genStoreIndTypeSIMD12(tree);
4888 return;
4889 }
4890#endif // FEATURE_SIMD
4891
4892 GenTree* data = tree->Data();
4893 GenTree* addr = tree->Addr();
4894 var_types targetType = tree->TypeGet();
4895
4896 assert(!varTypeIsFloating(targetType) || (targetType == data->TypeGet()));
4897
4898 GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(tree, data);
4899 if (writeBarrierForm != GCInfo::WBF_NoBarrier)
4900 {
4901 // data and addr must be in registers.
4902 // Consume both registers so that any copies of interfering registers are taken care of.
4903 genConsumeOperands(tree);
4904
4905 if (genEmitOptimizedGCWriteBarrier(writeBarrierForm, addr, data))
4906 {
4907 return;
4908 }
4909
4910 // At this point, we should not have any interference.
4911 // That is, 'data' must not be in REG_ARG_0, as that is where 'addr' must go.
4912 noway_assert(data->gtRegNum != REG_ARG_0);
4913
4914 // addr goes in REG_ARG_0
4915 genCopyRegIfNeeded(addr, REG_ARG_0);
4916
4917 // data goes in REG_ARG_1
4918 genCopyRegIfNeeded(data, REG_ARG_1);
4919
4920 genGCWriteBarrier(tree, writeBarrierForm);
4921 }
4922 else
4923 {
4924 bool dataIsUnary = false;
4925 bool isRMWMemoryOp = tree->IsRMWMemoryOp();
4926 GenTree* rmwSrc = nullptr;
4927
4928 // We must consume the operands in the proper execution order, so that liveness is
4929 // updated appropriately.
4930 genConsumeAddress(addr);
4931
        // If the tree represents an RMW memory op, then its data is a non-leaf node marked as
        // contained, and the non-indir operand of data is the source of the RMW memory op.
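        // For example (illustrative), "*(p) = *(p) + c" with a contained constant c becomes a single
        // "add dword ptr [addrReg], c" instead of a load/add/store sequence.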
4934 if (isRMWMemoryOp)
4935 {
4936 assert(data->isContained() && !data->OperIsLeaf());
4937
4938 GenTree* rmwDst = nullptr;
4939
4940 dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0);
4941 if (!dataIsUnary)
4942 {
4943 if (tree->IsRMWDstOp1())
4944 {
4945 rmwDst = data->gtGetOp1();
4946 rmwSrc = data->gtGetOp2();
4947 }
4948 else
4949 {
4950 assert(tree->IsRMWDstOp2());
4951 rmwDst = data->gtGetOp2();
4952 rmwSrc = data->gtGetOp1();
4953 }
4954
4955 genConsumeRegs(rmwSrc);
4956 }
4957 else
4958 {
                // *(p) = oper *(p): here addr = p, and rmwSrc = rmwDst = *(p), i.e. GT_IND(p).
4960 // For unary RMW ops, src and dst of RMW memory op is the same. Lower
4961 // clears operand counts on rmwSrc and we don't need to perform a
4962 // genConsumeReg() on it.
4963 assert(tree->IsRMWDstOp1());
4964 rmwSrc = data->gtGetOp1();
4965 rmwDst = data->gtGetOp1();
4966 assert(rmwSrc->isUsedFromMemory());
4967 }
4968
4969 assert(rmwSrc != nullptr);
4970 assert(rmwDst != nullptr);
4971 assert(Lowering::IndirsAreEquivalent(rmwDst, tree));
4972 }
4973 else
4974 {
4975 genConsumeRegs(data);
4976 }
4977
4978 if (isRMWMemoryOp)
4979 {
4980 if (dataIsUnary)
4981 {
4982 // generate code for unary RMW memory ops like neg/not
4983 getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree), tree);
4984 }
4985 else
4986 {
4987 if (data->OperIsShiftOrRotate())
4988 {
4989 // Generate code for shift RMW memory ops.
4990 // The data address needs to be op1 (it must be [addr] = [addr] <shift> <amount>, not [addr] =
4991 // <amount> <shift> [addr]).
4992 assert(tree->IsRMWDstOp1());
4993 assert(rmwSrc == data->gtGetOp2());
4994 genCodeForShiftRMW(tree);
4995 }
4996 else if (data->OperGet() == GT_ADD && (rmwSrc->IsIntegralConst(1) || rmwSrc->IsIntegralConst(-1)))
4997 {
4998 // Generate "inc/dec [mem]" instead of "add/sub [mem], 1".
4999 //
5000 // Notes:
5001 // 1) Global morph transforms GT_SUB(x, +/-1) into GT_ADD(x, -/+1).
5002 // 2) TODO-AMD64: Debugger routine NativeWalker::Decode() runs into
5003 // an assert while decoding ModR/M byte of "inc dword ptr [rax]".
5004 // It is not clear whether Decode() can handle all possible
5005 // addr modes with inc/dec. For this reason, inc/dec [mem]
5006 // is not generated while generating debuggable code. Update
5007 // the above if condition once Decode() routine is fixed.
5008 assert(rmwSrc->isContainedIntOrIImmed());
5009 instruction ins = rmwSrc->IsIntegralConst(1) ? INS_inc : INS_dec;
5010 getEmitter()->emitInsRMW(ins, emitTypeSize(tree), tree);
5011 }
5012 else
5013 {
5014 // generate code for remaining binary RMW memory ops like add/sub/and/or/xor
5015 getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree),
5016 tree, rmwSrc);
5017 }
5018 }
5019 }
5020 else
5021 {
5022 getEmitter()->emitInsStoreInd(ins_Store(data->TypeGet()), emitTypeSize(tree), tree);
5023 }
5024 }
5025}
5026
5027//------------------------------------------------------------------------
5028// genCodeForSwap: Produce code for a GT_SWAP node.
5029//
5030// Arguments:
5031// tree - the GT_SWAP node
5032//
5033void CodeGen::genCodeForSwap(GenTreeOp* tree)
5034{
5035 assert(tree->OperIs(GT_SWAP));
5036
5037 // Swap is only supported for lclVar operands that are enregistered
5038 // We do not consume or produce any registers. Both operands remain enregistered.
5039 // However, the gc-ness may change.
5040 assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2));
5041
5042 GenTreeLclVarCommon* lcl1 = tree->gtOp1->AsLclVarCommon();
5043 LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
5044 var_types type1 = varDsc1->TypeGet();
5045 GenTreeLclVarCommon* lcl2 = tree->gtOp2->AsLclVarCommon();
5046 LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
5047 var_types type2 = varDsc2->TypeGet();
5048
5049 // We must have both int or both fp regs
5050 assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
5051
5052 // FP swap is not yet implemented (and should have NYI'd in LSRA)
5053 assert(!varTypeIsFloating(type1));
5054
5055 regNumber oldOp1Reg = lcl1->gtRegNum;
5056 regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
5057 regNumber oldOp2Reg = lcl2->gtRegNum;
5058 regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
5059
5060 // We don't call genUpdateVarReg because we don't have a tree node with the new register.
5061 varDsc1->lvRegNum = oldOp2Reg;
5062 varDsc2->lvRegNum = oldOp1Reg;
5063
5064 // Do the xchg
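    // e.g. (illustrative): xchg rax, rdx - emitted with a GC-ref size attribute if the two locals
    // differ in GC-ness, so the emitter swaps the GC-ness of the registers as well.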
5065 emitAttr size = EA_PTRSIZE;
5066 if (varTypeGCtype(type1) != varTypeGCtype(type2))
5067 {
5068 // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
5069 // Otherwise it will leave them alone, which is correct if they have the same GC-ness.
5070 size = EA_GCREF;
5071 }
5072 inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
5073
5074 // Update the gcInfo.
    // Manually remove these regs from the gc sets (mostly to avoid confusing duplicative dump output)
5076 gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
5077 gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
5078
5079 // gcMarkRegPtrVal will do the appropriate thing for non-gc types.
5080 // It will also dump the updates.
5081 gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
5082 gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
5083}
5084
5085//------------------------------------------------------------------------
5086// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized
5087// helper functions.
5088//
5089// Arguments:
5090// writeBarrierForm - the write barrier form to use
5091// addr - the address at which to do the store
5092// data - the data to store
5093//
5094// Return Value:
5095// true if an optimized write barrier form was used, false if not. If this
5096// function returns false, the caller must emit a "standard" write barrier.
5097
5098bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data)
5099{
5100 assert(writeBarrierForm != GCInfo::WBF_NoBarrier);
5101
5102#if defined(_TARGET_X86_) && NOGC_WRITE_BARRIERS
5103 if (!genUseOptimizedWriteBarriers(writeBarrierForm))
5104 {
5105 return false;
5106 }
5107
5108 const static int regToHelper[2][8] = {
5109 // If the target is known to be in managed memory
5110 {
5111 CORINFO_HELP_ASSIGN_REF_EAX, // EAX
5112 CORINFO_HELP_ASSIGN_REF_ECX, // ECX
5113 -1, // EDX (always the target address)
5114 CORINFO_HELP_ASSIGN_REF_EBX, // EBX
5115 -1, // ESP
5116 CORINFO_HELP_ASSIGN_REF_EBP, // EBP
5117 CORINFO_HELP_ASSIGN_REF_ESI, // ESI
5118 CORINFO_HELP_ASSIGN_REF_EDI, // EDI
5119 },
5120
5121 // Don't know if the target is in managed memory
5122 {
5123 CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, // EAX
5124 CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, // ECX
5125 -1, // EDX (always the target address)
5126 CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, // EBX
5127 -1, // ESP
5128 CORINFO_HELP_CHECKED_ASSIGN_REF_EBP, // EBP
5129 CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, // ESI
5130 CORINFO_HELP_CHECKED_ASSIGN_REF_EDI, // EDI
5131 },
5132 };
5133
5134 noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX);
5135 noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX);
5136 noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX);
5137 noway_assert(regToHelper[0][REG_ESP] == -1);
5138 noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP);
5139 noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI);
5140 noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI);
5141
5142 noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX);
5143 noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX);
5144 noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX);
5145 noway_assert(regToHelper[1][REG_ESP] == -1);
5146 noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP);
5147 noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI);
5148 noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI);
5149
5150 regNumber reg = data->gtRegNum;
5151 noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER));
5152
5153 // Generate the following code:
5154 // lea edx, addr
5155 // call write_barrier_helper_reg
5156
5157 // addr goes in REG_ARG_0
5158 genCopyRegIfNeeded(addr, REG_WRITE_BARRIER);
5159
5160 unsigned tgtAnywhere = 0;
5161 if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked)
5162 {
5163 tgtAnywhere = 1;
5164 }
5165
5166 // We might want to call a modified version of genGCWriteBarrier() to get the benefit of
5167 // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works
5168 // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here.
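// For example, storing a GC ref held in EBX through an address whose GC-ness is unknown results in a
// call to CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, with the target address passed in EDX (REG_WRITE_BARRIER).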
5169
5170 genEmitHelperCall(regToHelper[tgtAnywhere][reg],
5171 0, // argSize
5172 EA_PTRSIZE); // retSize
5173
5174 return true;
5175#else // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
5176 return false;
5177#endif // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
5178}
5179
5180// Produce code for a GT_CALL node
5181void CodeGen::genCallInstruction(GenTreeCall* call)
5182{
5183 genAlignStackBeforeCall(call);
5184
5185 gtCallTypes callType = (gtCallTypes)call->gtCallType;
5186
5187 IL_OFFSETX ilOffset = BAD_IL_OFFSET;
5188
5189 // all virtuals should have been expanded into a control expression
5190 assert(!call->IsVirtual() || call->gtControlExpr || call->gtCallAddr);
5191
5192 // Insert a GS check if necessary
5193 if (call->IsTailCallViaHelper())
5194 {
5195 if (compiler->getNeedsGSSecurityCookie())
5196 {
5197#if FEATURE_FIXED_OUT_ARGS
5198 // If either of the conditions below is true, we will need a temporary register in order to perform the GS
5199 // cookie check. When FEATURE_FIXED_OUT_ARGS is disabled, we save and restore the temporary register using
5200 // push/pop. When FEATURE_FIXED_OUT_ARGS is enabled, however, we need an alternative solution. For now,
5201 // though, the tail prefix is ignored on all platforms that use fixed out args, so we should never hit this
5202 // case.
5203 assert(compiler->gsGlobalSecurityCookieAddr == nullptr);
5204 assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal);
5205#endif
5206 genEmitGSCookieCheck(true);
5207 }
5208 }
5209
5210 // Consume all the arg regs
5211 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
5212 {
5213 assert(list->OperIsList());
5214
5215 GenTree* argNode = list->Current();
5216
5217 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode->gtSkipReloadOrCopy());
5218 assert(curArgTabEntry);
5219
5220 if (curArgTabEntry->regNum == REG_STK)
5221 {
5222 continue;
5223 }
5224
5225#ifdef UNIX_AMD64_ABI
5226 // Deal with multi register passed struct args.
5227 if (argNode->OperGet() == GT_FIELD_LIST)
5228 {
5229 GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
5230 unsigned iterationNum = 0;
5231 for (; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), iterationNum++)
5232 {
5233 GenTree* putArgRegNode = fieldListPtr->gtOp.gtOp1;
5234 assert(putArgRegNode->gtOper == GT_PUTARG_REG);
5235 regNumber argReg = REG_NA;
5236
5237 if (iterationNum == 0)
5238 {
5239 argReg = curArgTabEntry->regNum;
5240 }
5241 else
5242 {
5243 assert(iterationNum == 1);
5244 argReg = curArgTabEntry->otherRegNum;
5245 }
5246
5247 genConsumeReg(putArgRegNode);
5248
5249 // Validate the putArgRegNode has the right type.
5250 assert(varTypeIsFloating(putArgRegNode->TypeGet()) == genIsValidFloatReg(argReg));
5251 if (putArgRegNode->gtRegNum != argReg)
5252 {
5253 inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), false), argReg, putArgRegNode->gtRegNum);
5254 }
5255 }
5256 }
5257 else
5258#endif // UNIX_AMD64_ABI
5259 {
5260 regNumber argReg = curArgTabEntry->regNum;
5261 genConsumeReg(argNode);
5262 if (argNode->gtRegNum != argReg)
5263 {
5264 inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), false), argReg, argNode->gtRegNum);
5265 }
5266 }
5267
5268#if FEATURE_VARARG
5269 // In the case of a varargs call,
5270 // the ABI dictates that if we have floating point args,
5271 // we must pass the enregistered arguments in both the
// integer and floating point registers, so let's do that.
5273 if (call->IsVarargs() && varTypeIsFloating(argNode))
5274 {
5275 regNumber targetReg = compiler->getCallArgIntRegister(argNode->gtRegNum);
5276 instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG);
5277 inst_RV_RV(ins, argNode->gtRegNum, targetReg);
5278 }
5279#endif // FEATURE_VARARG
5280 }
5281
5282#if defined(_TARGET_X86_) || defined(UNIX_AMD64_ABI)
// The call will pop its arguments.
// Compute the total outgoing stack argument size by summing the size of each putarg_stk node.
5285 ssize_t stackArgBytes = 0;
5286 GenTree* args = call->gtCallArgs;
5287 while (args)
5288 {
5289 GenTree* arg = args->gtOp.gtOp1;
5290 if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG))
5291 {
5292 if (arg->OperGet() == GT_PUTARG_STK)
5293 {
5294 GenTree* source = arg->gtOp.gtOp1;
5295 unsigned size = arg->AsPutArgStk()->getArgSize();
5296 stackArgBytes += size;
5297#ifdef DEBUG
5298 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
5299 assert(curArgTabEntry);
5300 assert(size == (curArgTabEntry->numSlots * TARGET_POINTER_SIZE));
5301#ifdef FEATURE_PUT_STRUCT_ARG_STK
5302 if (source->TypeGet() == TYP_STRUCT)
5303 {
5304 GenTreeObj* obj = source->AsObj();
5305 unsigned argBytes = roundUp(obj->gtBlkSize, TARGET_POINTER_SIZE);
5306 assert((curArgTabEntry->numSlots * TARGET_POINTER_SIZE) == argBytes);
5307 }
5308#endif // FEATURE_PUT_STRUCT_ARG_STK
5309#endif // DEBUG
5310 }
5311 }
5312 args = args->gtOp.gtOp2;
5313 }
5314#endif // defined(_TARGET_X86_) || defined(UNIX_AMD64_ABI)
5315
5316 // Insert a null check on "this" pointer if asked.
5317 if (call->NeedsNullCheck())
5318 {
5319 const regNumber regThis = genGetThisArgReg(call);
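// The cmp below dereferences [regThis]; its only purpose is to fault if 'this' is null, and the
// result of the comparison itself is ignored.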
5320 getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, regThis, regThis, 0);
5321 }
5322
5323 // Either gtControlExpr != null or gtCallAddr != null or it is a direct non-virtual call to a user or helper method.
5324 CORINFO_METHOD_HANDLE methHnd;
5325 GenTree* target = call->gtControlExpr;
5326 if (callType == CT_INDIRECT)
5327 {
5328 assert(target == nullptr);
5329 target = call->gtCallAddr;
5330 methHnd = nullptr;
5331 }
5332 else
5333 {
5334 methHnd = call->gtCallMethHnd;
5335 }
5336
5337 CORINFO_SIG_INFO* sigInfo = nullptr;
5338#ifdef DEBUG
5339 // Pass the call signature information down into the emitter so the emitter can associate
5340 // native call sites with the signatures they were generated from.
5341 if (callType != CT_HELPER)
5342 {
5343 sigInfo = call->callSig;
5344 }
5345#endif // DEBUG
5346
// If this is a fast tail call, then we are done. In this case we have set up the args (both reg args
// and stack args in the incoming arg area) and the call target in rax. The epilog sequence will
// generate "jmp rax".
5350 if (call->IsFastTailCall())
5351 {
5352 // Don't support fast tail calling JIT helpers
5353 assert(callType != CT_HELPER);
5354
5355 // Fast tail calls materialize call target either in gtControlExpr or in gtCallAddr.
5356 assert(target != nullptr);
5357
5358 genConsumeReg(target);
5359 genCopyRegIfNeeded(target, REG_RAX);
5360 return;
5361 }
5362
// For a pinvoke to unmanaged code we emit a label to clear
5364 // the GC pointer state before the callsite.
5365 // We can't utilize the typical lazy killing of GC pointers
5366 // at (or inside) the callsite.
5367 if (compiler->killGCRefs(call))
5368 {
5369 genDefineTempLabel(genCreateTempLabel());
5370 }
5371
5372 // Determine return value size(s).
5373 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
5374 emitAttr retSize = EA_PTRSIZE;
5375 emitAttr secondRetSize = EA_UNKNOWN;
5376
5377 if (call->HasMultiRegRetVal())
5378 {
5379 retSize = emitTypeSize(retTypeDesc->GetReturnRegType(0));
5380 secondRetSize = emitTypeSize(retTypeDesc->GetReturnRegType(1));
5381 }
5382 else
5383 {
5384 assert(!varTypeIsStruct(call));
5385
5386 if (call->gtType == TYP_REF)
5387 {
5388 retSize = EA_GCREF;
5389 }
5390 else if (call->gtType == TYP_BYREF)
5391 {
5392 retSize = EA_BYREF;
5393 }
5394 }
5395
5396#if defined(DEBUG) && defined(_TARGET_X86_)
5397 // Store the stack pointer so we can check it after the call.
5398 if (compiler->opts.compStackCheckOnCall && call->gtCallType == CT_USER_FUNC)
5399 {
5400 noway_assert(compiler->lvaCallSpCheck != 0xCCCCCCCC &&
5401 compiler->lvaTable[compiler->lvaCallSpCheck].lvDoNotEnregister &&
5402 compiler->lvaTable[compiler->lvaCallSpCheck].lvOnFrame);
5403 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaCallSpCheck, 0);
5404 }
5405#endif // defined(DEBUG) && defined(_TARGET_X86_)
5406
5407 bool fPossibleSyncHelperCall = false;
5408 CorInfoHelpFunc helperNum = CORINFO_HELP_UNDEF;
5409
5410 // We need to propagate the IL offset information to the call instruction, so we can emit
5411 // an IL to native mapping record for the call, to support managed return value debugging.
5412 // We don't want tail call helper calls that were converted from normal calls to get a record,
5413 // so we skip this hash table lookup logic in that case.
5414 if (compiler->opts.compDbgInfo && compiler->genCallSite2ILOffsetMap != nullptr && !call->IsTailCall())
5415 {
5416 (void)compiler->genCallSite2ILOffsetMap->Lookup(call, &ilOffset);
5417 }
5418
5419#if defined(_TARGET_X86_)
5420 bool fCallerPop = call->CallerPop();
5421
5422#ifdef UNIX_X86_ABI
5423 if (!call->IsUnmanaged())
5424 {
5425 CorInfoCallConv callConv = CORINFO_CALLCONV_DEFAULT;
5426
5427 if ((callType != CT_HELPER) && call->callSig)
5428 {
5429 callConv = call->callSig->callConv;
5430 }
5431
5432 fCallerPop |= IsCallerPop(callConv);
5433 }
5434#endif // UNIX_X86_ABI
5435
5436 // If the callee pops the arguments, we pass a positive value as the argSize, and the emitter will
5437 // adjust its stack level accordingly.
5438 // If the caller needs to explicitly pop its arguments, we must pass a negative value, and then do the
5439 // pop when we're done.
5440 ssize_t argSizeForEmitter = stackArgBytes;
5441 if (fCallerPop)
5442 {
5443 argSizeForEmitter = -stackArgBytes;
5444 }
5445#endif // defined(_TARGET_X86_)
5446
// When this is a PInvoke call to a user function (CT_USER_FUNC) and the function being compiled
// contains 256-bit AVX instructions, we issue VZEROUPPER here to avoid the AVX-256 to legacy SSE
// transition penalty, assuming the callee contains legacy SSE instructions.
// To limit the code size impact we only issue VZEROUPPER before the PInvoke call, not after it,
// because the transition penalty from legacy SSE to AVX only happens when a preceding 256-bit AVX
// to legacy SSE transition penalty has occurred.
5453 if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
5454 {
5455 assert(compiler->canUseVexEncoding());
5456 instGen(INS_vzeroupper);
5457 }
5458
5459 if (target != nullptr)
5460 {
5461#ifdef _TARGET_X86_
5462 if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
5463 {
5464 // On x86, we need to generate a very specific pattern for indirect VSD calls:
5465 //
5466 // 3-byte nop
5467 // call dword ptr [eax]
5468 //
5469 // Where EAX is also used as an argument to the stub dispatch helper. Make
5470 // sure that the call target address is computed into EAX in this case.
5471
5472 assert(compiler->virtualStubParamInfo->GetReg() == REG_VIRTUAL_STUB_TARGET);
5473
5474 assert(target->isContainedIndir());
5475 assert(target->OperGet() == GT_IND);
5476
5477 GenTree* addr = target->AsIndir()->Addr();
5478 assert(addr->isUsedFromReg());
5479
5480 genConsumeReg(addr);
5481 genCopyRegIfNeeded(addr, REG_VIRTUAL_STUB_TARGET);
5482
5483 getEmitter()->emitIns_Nop(3);
5484
5485 // clang-format off
5486 getEmitter()->emitIns_Call(emitter::EmitCallType(emitter::EC_INDIR_ARD),
5487 methHnd,
5488 INDEBUG_LDISASM_COMMA(sigInfo)
5489 nullptr,
5490 argSizeForEmitter,
5491 retSize
5492 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5493 gcInfo.gcVarPtrSetCur,
5494 gcInfo.gcRegGCrefSetCur,
5495 gcInfo.gcRegByrefSetCur,
5496 ilOffset, REG_VIRTUAL_STUB_TARGET, REG_NA, 1, 0);
5497 // clang-format on
5498 }
5499 else
5500#endif
5501 if (target->isContainedIndir())
5502 {
5503 if (target->AsIndir()->HasBase() && target->AsIndir()->Base()->isContainedIntOrIImmed())
5504 {
5505 // Note that if gtControlExpr is an indir of an absolute address, we mark it as
// contained only if it can be encoded as a PC-relative offset.
5507 assert(target->AsIndir()->Base()->AsIntConCommon()->FitsInAddrBase(compiler));
5508
5509 // clang-format off
5510 genEmitCall(emitter::EC_FUNC_TOKEN_INDIR,
5511 methHnd,
5512 INDEBUG_LDISASM_COMMA(sigInfo)
5513 (void*) target->AsIndir()->Base()->AsIntConCommon()->IconValue()
5514 X86_ARG(argSizeForEmitter),
5515 retSize
5516 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5517 ilOffset);
5518 // clang-format on
5519 }
5520 else
5521 {
5522 // clang-format off
5523 genEmitCall(emitter::EC_INDIR_ARD,
5524 methHnd,
5525 INDEBUG_LDISASM_COMMA(sigInfo)
5526 target->AsIndir()
5527 X86_ARG(argSizeForEmitter),
5528 retSize
5529 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5530 ilOffset);
5531 // clang-format on
5532 }
5533 }
5534 else
5535 {
5536 // We have already generated code for gtControlExpr evaluating it into a register.
5537 // We just need to emit "call reg" in this case.
5538 assert(genIsValidIntReg(target->gtRegNum));
5539
5540 // clang-format off
5541 genEmitCall(emitter::EC_INDIR_R,
5542 methHnd,
5543 INDEBUG_LDISASM_COMMA(sigInfo)
5544 nullptr // addr
5545 X86_ARG(argSizeForEmitter),
5546 retSize
5547 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5548 ilOffset,
5549 genConsumeReg(target));
5550 // clang-format on
5551 }
5552 }
5553#ifdef FEATURE_READYTORUN_COMPILER
5554 else if (call->gtEntryPoint.addr != nullptr)
5555 {
5556 // clang-format off
5557 genEmitCall((call->gtEntryPoint.accessType == IAT_VALUE) ? emitter::EC_FUNC_TOKEN
5558 : emitter::EC_FUNC_TOKEN_INDIR,
5559 methHnd,
5560 INDEBUG_LDISASM_COMMA(sigInfo)
5561 (void*) call->gtEntryPoint.addr
5562 X86_ARG(argSizeForEmitter),
5563 retSize
5564 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5565 ilOffset);
5566 // clang-format on
5567 }
5568#endif
5569 else
5570 {
5571 // Generate a direct call to a non-virtual user defined or helper method
5572 assert(callType == CT_HELPER || callType == CT_USER_FUNC);
5573
5574 void* addr = nullptr;
5575 if (callType == CT_HELPER)
5576 {
5577 // Direct call to a helper method.
5578 helperNum = compiler->eeGetHelperNum(methHnd);
5579 noway_assert(helperNum != CORINFO_HELP_UNDEF);
5580
5581 void* pAddr = nullptr;
5582 addr = compiler->compGetHelperFtn(helperNum, (void**)&pAddr);
5583 assert(pAddr == nullptr);
5584
5585 // tracking of region protected by the monitor in synchronized methods
5586 if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
5587 {
5588 fPossibleSyncHelperCall = true;
5589 }
5590 }
5591 else
5592 {
5593 // Direct call to a non-virtual user function.
5594 addr = call->gtDirectCallAddress;
5595 }
5596
5597 assert(addr != nullptr);
5598
5599 // Non-virtual direct calls to known addresses
5600
5601 // clang-format off
5602 genEmitCall(emitter::EC_FUNC_TOKEN,
5603 methHnd,
5604 INDEBUG_LDISASM_COMMA(sigInfo)
5605 addr
5606 X86_ARG(argSizeForEmitter),
5607 retSize
5608 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5609 ilOffset);
5610 // clang-format on
5611 }
5612
// If it was a pinvoke, we may have needed to get the address of a label
5614 if (genPendingCallLabel)
5615 {
5616 assert(call->IsUnmanaged());
5617 genDefineTempLabel(genPendingCallLabel);
5618 genPendingCallLabel = nullptr;
5619 }
5620
5621 // Update GC info:
5622 // All Callee arg registers are trashed and no longer contain any GC pointers.
// TODO-XArch-Bug?: As a matter of fact, shouldn't we be killing all of the callee-trashed regs here?
// For now we assert that, other than the arg regs, the gcref/byref sets don't contain any other
// registers from RBM_CALLEE_TRASH.
5626 assert((gcInfo.gcRegGCrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
5627 assert((gcInfo.gcRegByrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
5628 gcInfo.gcRegGCrefSetCur &= ~RBM_ARG_REGS;
5629 gcInfo.gcRegByrefSetCur &= ~RBM_ARG_REGS;
5630
5631 var_types returnType = call->TypeGet();
5632 if (returnType != TYP_VOID)
5633 {
5634#ifdef _TARGET_X86_
5635 if (varTypeIsFloating(returnType))
5636 {
5637 // Spill the value from the fp stack.
5638 // Then, load it into the target register.
5639 call->gtFlags |= GTF_SPILL;
5640 regSet.rsSpillFPStack(call);
5641 call->gtFlags |= GTF_SPILLED;
5642 call->gtFlags &= ~GTF_SPILL;
5643 }
5644 else
5645#endif // _TARGET_X86_
5646 {
5647 regNumber returnReg;
5648
5649 if (call->HasMultiRegRetVal())
5650 {
5651 assert(retTypeDesc != nullptr);
5652 unsigned regCount = retTypeDesc->GetReturnRegCount();
5653
// If the regs allocated to the call node are different from the ABI return
// regs in which the call has returned its result, move the result
// to the regs allocated to the call node.
5657 for (unsigned i = 0; i < regCount; ++i)
5658 {
5659 var_types regType = retTypeDesc->GetReturnRegType(i);
5660 returnReg = retTypeDesc->GetABIReturnReg(i);
5661 regNumber allocatedReg = call->GetRegNumByIdx(i);
5662 if (returnReg != allocatedReg)
5663 {
5664 inst_RV_RV(ins_Copy(regType), allocatedReg, returnReg, regType);
5665 }
5666 }
5667
5668#ifdef FEATURE_SIMD
5669 // A Vector3 return value is stored in xmm0 and xmm1.
5670 // RyuJIT assumes that the upper unused bits of xmm1 are cleared but
5671 // the native compiler doesn't guarantee it.
5672 if (returnType == TYP_SIMD12)
5673 {
5674 returnReg = retTypeDesc->GetABIReturnReg(1);
// Keep only the low 32 bits of xmm1 (the third element) by clearing the upper 96 bits with two
// byte-shift instructions:
// retReg = retReg << 96
// retReg = retReg >> 96
5678 getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
5679 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
5680 }
5681#endif // FEATURE_SIMD
5682 }
5683 else
5684 {
5685#ifdef _TARGET_X86_
5686 if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
5687 {
5688 // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
5689 // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
5690 // correct argument registers.
5691 returnReg = REG_PINVOKE_TCB;
5692 }
5693 else
5694#endif // _TARGET_X86_
5695 if (varTypeIsFloating(returnType))
5696 {
5697 returnReg = REG_FLOATRET;
5698 }
5699 else
5700 {
5701 returnReg = REG_INTRET;
5702 }
5703
5704 if (call->gtRegNum != returnReg)
5705 {
5706 inst_RV_RV(ins_Copy(returnType), call->gtRegNum, returnReg, returnType);
5707 }
5708 }
5709
5710 genProduceReg(call);
5711 }
5712 }
5713
5714 // If there is nothing next, that means the result is thrown away, so this value is not live.
5715 // However, for minopts or debuggable code, we keep it live to support managed return value debugging.
5716 if ((call->gtNext == nullptr) && compiler->opts.OptimizationEnabled())
5717 {
5718 gcInfo.gcMarkRegSetNpt(RBM_INTRET);
5719 }
5720
5721#if defined(DEBUG) && defined(_TARGET_X86_)
5722 if (compiler->opts.compStackCheckOnCall && call->gtCallType == CT_USER_FUNC)
5723 {
5724 noway_assert(compiler->lvaCallSpCheck != 0xCCCCCCCC &&
5725 compiler->lvaTable[compiler->lvaCallSpCheck].lvDoNotEnregister &&
5726 compiler->lvaTable[compiler->lvaCallSpCheck].lvOnFrame);
5727 if (!fCallerPop && (stackArgBytes != 0))
5728 {
// ECX is trashed, so it can be used to compute the expected SP. We saved the value of SP
// after pushing all the stack arguments, but the callee popped the arguments, so we need
// to do some math to get the correct comparison.
5732 getEmitter()->emitIns_R_R(INS_mov, EA_4BYTE, REG_ARG_0, REG_SPBASE);
5733 getEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_ARG_0, stackArgBytes);
5734 getEmitter()->emitIns_S_R(INS_cmp, EA_4BYTE, REG_ARG_0, compiler->lvaCallSpCheck, 0);
5735 }
5736 else
5737 {
5738 getEmitter()->emitIns_S_R(INS_cmp, EA_4BYTE, REG_SPBASE, compiler->lvaCallSpCheck, 0);
5739 }
5740
5741 BasicBlock* sp_check = genCreateTempLabel();
5742 getEmitter()->emitIns_J(INS_je, sp_check);
5743 instGen(INS_BREAKPOINT);
5744 genDefineTempLabel(sp_check);
5745 }
5746#endif // defined(DEBUG) && defined(_TARGET_X86_)
5747
5748#if !FEATURE_EH_FUNCLETS
5749 //-------------------------------------------------------------------------
5750 // Create a label for tracking of region protected by the monitor in synchronized methods.
5751 // This needs to be here, rather than above where fPossibleSyncHelperCall is set,
5752 // so the GC state vars have been updated before creating the label.
5753
5754 if (fPossibleSyncHelperCall)
5755 {
5756 switch (helperNum)
5757 {
5758 case CORINFO_HELP_MON_ENTER:
5759 case CORINFO_HELP_MON_ENTER_STATIC:
5760 noway_assert(compiler->syncStartEmitCookie == NULL);
5761 compiler->syncStartEmitCookie =
5762 getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
5763 noway_assert(compiler->syncStartEmitCookie != NULL);
5764 break;
5765 case CORINFO_HELP_MON_EXIT:
5766 case CORINFO_HELP_MON_EXIT_STATIC:
5767 noway_assert(compiler->syncEndEmitCookie == NULL);
5768 compiler->syncEndEmitCookie =
5769 getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
5770 noway_assert(compiler->syncEndEmitCookie != NULL);
5771 break;
5772 default:
5773 break;
5774 }
5775 }
5776#endif // !FEATURE_EH_FUNCLETS
5777
5778 unsigned stackAdjustBias = 0;
5779
5780#if defined(_TARGET_X86_)
5781 // Is the caller supposed to pop the arguments?
5782 if (fCallerPop && (stackArgBytes != 0))
5783 {
5784 stackAdjustBias = stackArgBytes;
5785 }
5786
5787 SubtractStackLevel(stackArgBytes);
5788#endif // _TARGET_X86_
5789
5790 genRemoveAlignmentAfterCall(call, stackAdjustBias);
5791}
5792
5793// Produce code for a GT_JMP node.
// The caller's arguments need to be transferred to the callee before exiting the caller.
// The actual jump to the callee is generated as part of the caller's epilog sequence.
// Therefore the codegen of GT_JMP only has to ensure that the callee's arguments are set up correctly.
5797void CodeGen::genJmpMethod(GenTree* jmp)
5798{
5799 assert(jmp->OperGet() == GT_JMP);
5800 assert(compiler->compJmpOpUsed);
5801
5802 // If no arguments, nothing to do
5803 if (compiler->info.compArgsCount == 0)
5804 {
5805 return;
5806 }
5807
5808 // Make sure register arguments are in their initial registers
5809 // and stack arguments are put back as well.
5810 unsigned varNum;
5811 LclVarDsc* varDsc;
5812
// First move any enregistered stack arguments back to the stack.
// At the same time, any reg arg that is not in its correct register is moved back to its stack location.
//
// We are not strictly required to spill reg args that are not in the desired reg for a jmp call,
// but doing otherwise would require us to deal with circularity while moving values around. Spilling
// to the stack keeps the implementation simple, which is not a bad trade-off given that jmp calls
// are not frequent.
5820 for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
5821 {
5822 varDsc = compiler->lvaTable + varNum;
5823
5824 if (varDsc->lvPromoted)
5825 {
5826 noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
5827
5828 unsigned fieldVarNum = varDsc->lvFieldLclStart;
5829 varDsc = compiler->lvaTable + fieldVarNum;
5830 }
5831 noway_assert(varDsc->lvIsParam);
5832
5833 if (varDsc->lvIsRegArg && (varDsc->lvRegNum != REG_STK))
5834 {
// Skip reg args that are already in the right register for the jmp call.
// If not, we will spill such args to their stack locations.
5837 //
5838 // If we need to generate a tail call profiler hook, then spill all
5839 // arg regs to free them up for the callback.
5840 if (!compiler->compIsProfilerHookNeeded() && (varDsc->lvRegNum == varDsc->lvArgReg))
5841 {
5842 continue;
5843 }
5844 }
5845 else if (varDsc->lvRegNum == REG_STK)
5846 {
// Skip args that are currently living on the stack.
5848 continue;
5849 }
5850
// If we came here, it means we have either a reg argument that is not in the right register or
// a stack argument that is currently living in a register. In either case the following
// assert should hold.
5854 assert(varDsc->lvRegNum != REG_STK);
5855
5856 assert(!varDsc->lvIsStructField || (compiler->lvaTable[varDsc->lvParentLcl].lvFieldCnt == 1));
5857 var_types storeType = genActualType(varDsc->lvaArgType()); // We own the memory and can use the full move.
5858 getEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), varDsc->lvRegNum, varNum, 0);
5859
5860 // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live.
5861 // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
5862 // Therefore manually update life of varDsc->lvRegNum.
5863 regMaskTP tempMask = varDsc->lvRegMask();
5864 regSet.RemoveMaskVars(tempMask);
5865 gcInfo.gcMarkRegSetNpt(tempMask);
5866 if (compiler->lvaIsGCTracked(varDsc))
5867 {
5868#ifdef DEBUG
5869 if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
5870 {
5871 JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum);
5872 }
5873 else
5874 {
5875 JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum);
5876 }
5877#endif // DEBUG
5878
5879 VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5880 }
5881 }
5882
5883#ifdef PROFILING_SUPPORTED
5884 // At this point all arg regs are free.
5885 // Emit tail call profiler callback.
5886 genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
5887#endif
5888
// Next, move any register arguments that are not currently enregistered back into their arg registers.
5890 regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method.
5891 unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method.
5892 for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
5893 {
5894 varDsc = compiler->lvaTable + varNum;
5895 if (varDsc->lvPromoted)
5896 {
5897 noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
5898
5899 unsigned fieldVarNum = varDsc->lvFieldLclStart;
5900 varDsc = compiler->lvaTable + fieldVarNum;
5901 }
5902 noway_assert(varDsc->lvIsParam);
5903
5904 // Skip if arg not passed in a register.
5905 if (!varDsc->lvIsRegArg)
5906 {
5907 continue;
5908 }
5909
5910#if defined(UNIX_AMD64_ABI)
5911 if (varTypeIsStruct(varDsc))
5912 {
5913 CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
5914 assert(typeHnd != nullptr);
5915
5916 SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
5917 compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
5918 assert(structDesc.passedInRegisters);
5919
5920 unsigned __int8 offset0 = 0;
5921 unsigned __int8 offset1 = 0;
5922 var_types type0 = TYP_UNKNOWN;
5923 var_types type1 = TYP_UNKNOWN;
5924
5925 // Get the eightbyte data
5926 compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1);
5927
5928 // Move the values into the right registers.
5929 //
5930
5931 // Update varDsc->lvArgReg and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and
5932 // argReg is going live. Note that we cannot modify varDsc->lvRegNum and lvOtherArgReg here because another
5933 // basic block may not be expecting it. Therefore manually update life of argReg. Note that GT_JMP marks
5934 // the end of the basic block and after which reg life and gc info will be recomputed for the new block in
5935 // genCodeForBBList().
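// As an illustrative example (not from the source), a struct { object o; double d; } argument would
// yield type0 = TYP_REF at offset 0, loaded into the integer register lvArgReg, and type1 = TYP_DOUBLE
// at offset 8, loaded into the float register lvOtherArgReg.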
5936 if (type0 != TYP_UNKNOWN)
5937 {
5938 getEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->lvArgReg, varNum, offset0);
5939 regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg);
5940 gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0);
5941 }
5942
5943 if (type1 != TYP_UNKNOWN)
5944 {
5945 getEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->lvOtherArgReg, varNum, offset1);
5946 regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg);
5947 gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1);
5948 }
5949
5950 if (varDsc->lvTracked)
5951 {
5952 VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5953 }
5954 }
5955 else
#endif // defined(UNIX_AMD64_ABI)
5957 {
5958 // Register argument
5959 noway_assert(isRegParamType(genActualType(varDsc->TypeGet())));
5960
5961 // Is register argument already in the right register?
5962 // If not load it from its stack location.
5963 var_types loadType = varDsc->lvaArgType();
5964 regNumber argReg = varDsc->lvArgReg; // incoming arg register
5965
5966 if (varDsc->lvRegNum != argReg)
5967 {
5968 assert(genIsValidReg(argReg));
5969 getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
5970
5971 // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
5972 // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
5973 // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
5974 // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
5975 regSet.AddMaskVars(genRegMask(argReg));
5976 gcInfo.gcMarkRegPtrVal(argReg, loadType);
5977 if (compiler->lvaIsGCTracked(varDsc))
5978 {
5979#ifdef DEBUG
5980 if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
5981 {
5982 JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum);
5983 }
5984 else
5985 {
5986 JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum);
5987 }
5988#endif // DEBUG
5989
5990 VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5991 }
5992 }
5993 }
5994
5995#if FEATURE_VARARG && defined(_TARGET_AMD64_)
5996 // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg
5997 // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to
5998 // be passed in both integer and floating point registers. It doesn't apply to x86, which passes floating point
5999 // values on the stack.
6000 if (compiler->info.compIsVarArgs)
6001 {
6002 regNumber intArgReg;
6003 var_types loadType = varDsc->lvaArgType();
6004 regNumber argReg = varDsc->lvArgReg; // incoming arg register
6005
6006 if (varTypeIsFloating(loadType))
6007 {
6008 intArgReg = compiler->getCallArgIntRegister(argReg);
6009 instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG);
6010 inst_RV_RV(ins, argReg, intArgReg, loadType);
6011 }
6012 else
6013 {
6014 intArgReg = argReg;
6015 }
6016
6017 fixedIntArgMask |= genRegMask(intArgReg);
6018
6019 if (intArgReg == REG_ARG_0)
6020 {
6021 assert(firstArgVarNum == BAD_VAR_NUM);
6022 firstArgVarNum = varNum;
6023 }
6024 }
6025#endif // FEATURE_VARARG
6026 }
6027
6028#if FEATURE_VARARG && defined(_TARGET_AMD64_)
6029 // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments,
6030 // load the remaining arg registers (both int and float) from the corresponding
// shadow stack slots. This is because we don't know the number and types of the non-fixed params
// passed by the caller, so we have to assume the worst case: the caller passing float/double args
// in both the int and float arg regs.
6034 //
6035 // This doesn't apply to x86, which doesn't pass floating point values in floating
6036 // point registers.
6037 //
// The caller could have passed gc-ref/byref typed var args. Since these are var args,
// the callee has no way of knowing their gc-ness. Therefore, mark the region that loads
// the remaining arg registers from shadow stack slots as non-GC interruptible.
6041 if (fixedIntArgMask != RBM_NONE)
6042 {
6043 assert(compiler->info.compIsVarArgs);
6044 assert(firstArgVarNum != BAD_VAR_NUM);
6045
6046 regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask;
6047 if (remainingIntArgMask != RBM_NONE)
6048 {
6049 instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE);
6050 getEmitter()->emitDisableGC();
6051 for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum)
6052 {
6053 regNumber argReg = intArgRegs[argNum];
6054 regMaskTP argRegMask = genRegMask(argReg);
6055
6056 if ((remainingIntArgMask & argRegMask) != 0)
6057 {
6058 remainingIntArgMask &= ~argRegMask;
6059 getEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset);
6060
6061 // also load it in corresponding float arg reg
6062 regNumber floatReg = compiler->getCallArgFloatRegister(argReg);
6063 inst_RV_RV(insCopyIntToFloat, floatReg, argReg);
6064 }
6065
6066 argOffset += REGSIZE_BYTES;
6067 }
6068 getEmitter()->emitEnableGC();
6069 }
6070 }
6071#endif // FEATURE_VARARG
6072}
6073
// Produce code for a GT_LEA subnode
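// The emitted instruction has the general form "lea dstReg, [base + index*scale + offset]", where the
// base and/or index component may be absent.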
6075void CodeGen::genLeaInstruction(GenTreeAddrMode* lea)
6076{
6077 emitAttr size = emitTypeSize(lea);
6078 genConsumeOperands(lea);
6079
6080 if (lea->Base() && lea->Index())
6081 {
6082 regNumber baseReg = lea->Base()->gtRegNum;
6083 regNumber indexReg = lea->Index()->gtRegNum;
6084 getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, baseReg, indexReg, lea->gtScale, lea->Offset());
6085 }
6086 else if (lea->Base())
6087 {
6088 getEmitter()->emitIns_R_AR(INS_lea, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->Offset());
6089 }
6090 else if (lea->Index())
6091 {
6092 getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, REG_NA, lea->Index()->gtRegNum, lea->gtScale,
6093 lea->Offset());
6094 }
6095
6096 genProduceReg(lea);
6097}
6098
6099//-------------------------------------------------------------------------------------------
6100// genJumpKindsForTree: Determine the number and kinds of conditional branches
6101// necessary to implement the given GT_CMP node
6102//
6103// Arguments:
// cmpTree - (input) The GenTree relop node that is used to set the condition codes
// jmpKind[2] - (output) One or two conditional branch instructions
// jmpToTrueLabel[2] - (output) When true we branch to the true case;
// when false we create a second label and branch to the false case.
// Only GT_EQ for floating point compares can have a false value.
6110//
6111// Return Value:
6112// Sets the proper values into the array elements of jmpKind[] and jmpToTrueLabel[]
6113//
6114// Assumptions:
6115// At least one conditional branch instruction will be returned.
6116// Typically only one conditional branch is needed
6117// and the second jmpKind[] value is set to EJ_NONE
6118//
6119// Notes:
6120// jmpToTrueLabel[i]= true implies branch when the compare operation is true.
6121// jmpToTrueLabel[i]= false implies branch when the compare operation is false.
6122//-------------------------------------------------------------------------------------------
6123
6124// static
6125void CodeGen::genJumpKindsForTree(GenTree* cmpTree, emitJumpKind jmpKind[2], bool jmpToTrueLabel[2])
6126{
6127 // Except for BEQ (= ordered GT_EQ) both jumps are to the true label.
6128 jmpToTrueLabel[0] = true;
6129 jmpToTrueLabel[1] = true;
6130
6131 // For integer comparisons just use genJumpKindForOper
6132 if (!varTypeIsFloating(cmpTree->gtOp.gtOp1))
6133 {
6134 CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
6135 jmpKind[0] = genJumpKindForOper(cmpTree->gtOper, compareKind);
6136 jmpKind[1] = EJ_NONE;
6137 }
6138 else
6139 {
6140 assert(cmpTree->OperIsCompare());
6141
6142 // For details on how we arrived at this mapping, see the comment block in genCodeForTreeNode()
// while generating code for compare operators (e.g. GT_EQ etc).
6144 if ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) != 0)
6145 {
6146 // Must branch if we have an NaN, unordered
6147 switch (cmpTree->gtOper)
6148 {
6149 case GT_LT:
6150 case GT_GT:
6151 jmpKind[0] = EJ_jb;
6152 jmpKind[1] = EJ_NONE;
6153 break;
6154
6155 case GT_LE:
6156 case GT_GE:
6157 jmpKind[0] = EJ_jbe;
6158 jmpKind[1] = EJ_NONE;
6159 break;
6160
6161 case GT_NE:
6162 jmpKind[0] = EJ_jpe;
6163 jmpKind[1] = EJ_jne;
6164 break;
6165
6166 case GT_EQ:
6167 jmpKind[0] = EJ_je;
6168 jmpKind[1] = EJ_NONE;
6169 break;
6170
6171 default:
6172 unreached();
6173 }
6174 }
6175 else // ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) == 0)
6176 {
6177 // Do not branch if we have an NaN, unordered
6178 switch (cmpTree->gtOper)
6179 {
6180 case GT_LT:
6181 case GT_GT:
6182 jmpKind[0] = EJ_ja;
6183 jmpKind[1] = EJ_NONE;
6184 break;
6185
6186 case GT_LE:
6187 case GT_GE:
6188 jmpKind[0] = EJ_jae;
6189 jmpKind[1] = EJ_NONE;
6190 break;
6191
6192 case GT_NE:
6193 jmpKind[0] = EJ_jne;
6194 jmpKind[1] = EJ_NONE;
6195 break;
6196
6197 case GT_EQ:
6198 jmpKind[0] = EJ_jpe;
6199 jmpKind[1] = EJ_je;
6200 jmpToTrueLabel[0] = false;
6201 break;
6202
6203 default:
6204 unreached();
6205 }
6206 }
6207 }
6208}
6209
6210//------------------------------------------------------------------------
6211// genCompareFloat: Generate code for comparing two floating point values
6212//
6213// Arguments:
6214// treeNode - the compare tree
6215//
6216// Return Value:
6217// None.
6218// Comments:
// The SSE2 instruction ucomis[s|d] performs an unordered comparison and
// updates the rFLAGS register as follows.
//
// Result of compare     ZF  PF  CF
// -----------------     --  --  --
// Unordered              1   1   1  <-- this result implies one of the operands of the compare is a NaN.
// Greater                0   0   0
// Less Than              0   0   1
// Equal                  1   0   0
6227//
6228// From the above table the following equalities follow. As per ECMA spec *.UN opcodes perform
// unordered comparison of floating point values. That is, *.UN comparisons result in true when
// one of the operands is a NaN, whereas ordered comparisons result in false.
6231//
6232// Opcode Amd64 equivalent Comment
6233// ------ ----------------- --------
6234// BLT.UN(a,b) ucomis[s|d] a, b Jb branches if CF=1, which means either a<b or unordered from the above
6235// jb table
6236//
// BLT(a,b) ucomis[s|d] b, a Ja branches if CF=0 and ZF=0, which means b>a, which in turn implies a<b
6238// ja
6239//
6240// BGT.UN(a,b) ucomis[s|d] b, a branch if b<a or unordered ==> branch if a>b or unordered
6241// jb
6242//
6243// BGT(a, b) ucomis[s|d] a, b branch if a>b
6244// ja
6245//
6246// BLE.UN(a,b) ucomis[s|d] a, b jbe branches if CF=1 or ZF=1, which implies a<=b or unordered
6247// jbe
6248//
// BLE(a,b) ucomis[s|d] b, a jae branches if CF=0, which means b>=a, i.e. a<=b
6250// jae
6251//
6252// BGE.UN(a,b) ucomis[s|d] b, a branch if b<=a or unordered ==> branch if a>=b or unordered
6253// jbe
6254//
6255// BGE(a,b) ucomis[s|d] a, b branch if a>=b
6256// jae
6257//
6258// BEQ.UN(a,b) ucomis[s|d] a, b branch if a==b or unordered. There is no BEQ.UN opcode in ECMA spec.
// je This case is given for completeness, in case the JIT generates such
6260// a gentree internally.
6261//
6262// BEQ(a,b) ucomis[s|d] a, b From the above table, PF=0 and ZF=1 corresponds to a==b.
6263// jpe L1
6264// je <true label>
6265// L1:
6266//
6267// BNE(a,b) ucomis[s|d] a, b branch if a!=b. There is no BNE opcode in ECMA spec. This case is
// jne given for completeness, in case the JIT generates such a gentree
6269// internally.
6270//
6271// BNE.UN(a,b) ucomis[s|d] a, b From the above table, PF=1 or ZF=0 implies unordered or a!=b
6272// jpe <true label>
6273// jne <true label>
6274//
// As we can see from the above equalities, the operands of a compare operator need to be
// reversed in the case of BLT/CLT, BGT.UN/CGT.UN, BLE/CLE, BGE.UN/CGE.UN.
6277void CodeGen::genCompareFloat(GenTree* treeNode)
6278{
6279 assert(treeNode->OperIsCompare());
6280
6281 GenTreeOp* tree = treeNode->AsOp();
6282 GenTree* op1 = tree->gtOp1;
6283 GenTree* op2 = tree->gtOp2;
6284 var_types op1Type = op1->TypeGet();
6285 var_types op2Type = op2->TypeGet();
6286
6287 genConsumeOperands(tree);
6288
6289 assert(varTypeIsFloating(op1Type));
6290 assert(op1Type == op2Type);
6291
6292 regNumber targetReg = treeNode->gtRegNum;
6293 instruction ins;
6294 emitAttr cmpAttr;
6295
6296 bool reverseOps;
6297 if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
6298 {
6299 // Unordered comparison case
6300 reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE);
6301 }
6302 else
6303 {
6304 reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE);
6305 }
6306
6307 if (reverseOps)
6308 {
6309 GenTree* tmp = op1;
6310 op1 = op2;
6311 op2 = tmp;
6312 }
6313
6314 ins = ins_FloatCompare(op1Type);
6315 cmpAttr = emitTypeSize(op1Type);
6316
6317 getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2);
6318
6319 // Are we evaluating this into a register?
6320 if (targetReg != REG_NA)
6321 {
6322 genSetRegToCond(targetReg, tree);
6323 genProduceReg(tree);
6324 }
6325}
6326
6327//------------------------------------------------------------------------
6328// genCompareInt: Generate code for comparing ints or, on amd64, longs.
6329//
6330// Arguments:
6331// treeNode - the compare tree
6332//
6333// Return Value:
6334// None.
6335void CodeGen::genCompareInt(GenTree* treeNode)
6336{
6337 assert(treeNode->OperIsCompare() || treeNode->OperIs(GT_CMP));
6338
6339 GenTreeOp* tree = treeNode->AsOp();
6340 GenTree* op1 = tree->gtOp1;
6341 GenTree* op2 = tree->gtOp2;
6342 var_types op1Type = op1->TypeGet();
6343 var_types op2Type = op2->TypeGet();
6344 regNumber targetReg = tree->gtRegNum;
6345
6346 genConsumeOperands(tree);
6347
6348 assert(!op1->isContainedIntOrIImmed());
6349 assert(!varTypeIsFloating(op2Type));
6350
6351 instruction ins;
6352 var_types type = TYP_UNKNOWN;
6353
6354 if (tree->OperIs(GT_TEST_EQ, GT_TEST_NE))
6355 {
6356 ins = INS_test;
6357
// Unlike many xarch instructions, TEST doesn't have a form with a 16/32/64 bit first operand and
// an 8 bit immediate second operand. But if the immediate value fits in 8 bits then we can simply
// emit an 8 bit TEST instruction, unless we're targeting x86 and the first operand is a non-byteable
// register.
// Note that lowering does something similar but its main purpose is to allow memory operands to be
// contained, so it doesn't handle other kinds of operands. It could do more, but on x86 that results
// in additional register constraints and that may be worse than wasting 3 bytes on an immediate.
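// For example, a test against the constant 8 can be emitted as "test cl, 8" instead of "test ecx, 8",
// saving the 3 extra immediate bytes (CL is a byteable register).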
6365 if (
6366#ifdef _TARGET_X86_
6367 (!op1->isUsedFromReg() || isByteReg(op1->gtRegNum)) &&
6368#endif
6369 (op2->IsCnsIntOrI() && genSmallTypeCanRepresentValue(TYP_UBYTE, op2->AsIntCon()->IconValue())))
6370 {
6371 type = TYP_UBYTE;
6372 }
6373 }
6374 else if (op1->isUsedFromReg() && op2->IsIntegralConst(0))
6375 {
6376 // We're comparing a register to 0 so we can generate "test reg1, reg1"
6377 // instead of the longer "cmp reg1, 0"
6378 ins = INS_test;
6379 op2 = op1;
6380 }
6381 else
6382 {
6383 ins = INS_cmp;
6384 }
6385
6386 if (type == TYP_UNKNOWN)
6387 {
6388 if (op1Type == op2Type)
6389 {
6390 type = op1Type;
6391 }
6392 else if (genTypeSize(op1Type) == genTypeSize(op2Type))
6393 {
6394 // If the types are different but have the same size then we'll use TYP_INT or TYP_LONG.
6395 // This primarily deals with small type mixes (e.g. byte/ubyte) that need to be widened
6396 // and compared as int. We should not get long type mixes here but handle that as well
6397 // just in case.
6398 type = genTypeSize(op1Type) == 8 ? TYP_LONG : TYP_INT;
6399 }
6400 else
6401 {
// If the types are different, simply use TYP_INT. This deals with small type/int type
// mixes (e.g. byte/short, ubyte/int) that need to be widened and compared as int.
6404 // Lowering is expected to handle any mixes that involve long types (e.g. int/long).
6405 type = TYP_INT;
6406 }
6407
6408 // The common type cannot be smaller than any of the operand types, we're probably mixing int/long
6409 assert(genTypeSize(type) >= max(genTypeSize(op1Type), genTypeSize(op2Type)));
6410 // Small unsigned int types (TYP_BOOL can use anything) should use unsigned comparisons
6411 assert(!(varTypeIsSmallInt(type) && varTypeIsUnsigned(type)) || ((tree->gtFlags & GTF_UNSIGNED) != 0));
6412 // If op1 is smaller then it cannot be in memory, we're probably missing a cast
6413 assert((genTypeSize(op1Type) >= genTypeSize(type)) || !op1->isUsedFromMemory());
6414 // If op2 is smaller then it cannot be in memory, we're probably missing a cast
6415 assert((genTypeSize(op2Type) >= genTypeSize(type)) || !op2->isUsedFromMemory());
6416 // If we ended up with a small type and op2 is a constant then make sure we don't lose constant bits
6417 assert(!op2->IsCnsIntOrI() || !varTypeIsSmall(type) ||
6418 genSmallTypeCanRepresentValue(type, op2->AsIntCon()->IconValue()));
6419 }
6420
6421 // The type cannot be larger than the machine word size
6422 assert(genTypeSize(type) <= genTypeSize(TYP_I_IMPL));
6423 // TYP_UINT and TYP_ULONG should not appear here, only small types can be unsigned
6424 assert(!varTypeIsUnsigned(type) || varTypeIsSmall(type));
6425
6426 getEmitter()->emitInsBinary(ins, emitTypeSize(type), op1, op2);
6427
6428 // Are we evaluating this into a register?
6429 if (targetReg != REG_NA)
6430 {
6431 genSetRegToCond(targetReg, tree);
6432 genProduceReg(tree);
6433 }
6434}
6435
6436//-------------------------------------------------------------------------------------------
6437// genSetRegToCond: Set a register 'dstReg' to the appropriate one or zero value
6438// corresponding to a binary Relational operator result.
6439//
6440// Arguments:
6441// dstReg - The target register to set to 1 or 0
6442// tree - The GenTree Relop node that was used to set the Condition codes
6443//
6444// Return Value: none
6445//
6446// Notes:
// A full 64-bit value of either 1 or 0 is set up in the 'dstReg'
6448//-------------------------------------------------------------------------------------------
6449
6450void CodeGen::genSetRegToCond(regNumber dstReg, GenTree* tree)
6451{
6452 noway_assert((genRegMask(dstReg) & RBM_BYTE_REGS) != 0);
6453
6454 emitJumpKind jumpKind[2];
6455 bool branchToTrueLabel[2];
6456 genJumpKindsForTree(tree, jumpKind, branchToTrueLabel);
6457
6458 if (jumpKind[1] == EJ_NONE)
6459 {
6460 // Set (lower byte of) reg according to the flags
6461 inst_SET(jumpKind[0], dstReg);
6462 }
6463 else
6464 {
6465#ifdef DEBUG
// jmpKind[1] != EJ_NONE implies BEQ and BNE.UN of floating point values.
6467 // These are represented by two conditions.
6468 if (tree->gtOper == GT_EQ)
6469 {
6470 // This must be an ordered comparison.
6471 assert((tree->gtFlags & GTF_RELOP_NAN_UN) == 0);
6472 }
6473 else
6474 {
6475 // This must be BNE.UN
6476 assert((tree->gtOper == GT_NE) && ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0));
6477 }
6478#endif
6479
6480 // Here is the sample code generated in each case:
6481 // BEQ == cmp, jpe <false label>, je <true label>
// That is, to materialize the comparison, reg needs to be set if PF=0 and ZF=1
6483 // setnp reg // if (PF==0) reg = 1 else reg = 0
6484 // jpe L1 // Jmp if PF==1
6485 // sete reg
6486 // L1:
6487 //
6488 // BNE.UN == cmp, jpe <true label>, jne <true label>
// That is, to materialize the comparison, reg needs to be set if either PF=1 or ZF=0;
6490 // setp reg
6491 // jpe L1
6492 // setne reg
6493 // L1:
6494
6495 // reverse the jmpkind condition before setting dstReg if it is to false label.
6496 inst_SET(branchToTrueLabel[0] ? jumpKind[0] : emitter::emitReverseJumpKind(jumpKind[0]), dstReg);
6497
6498 BasicBlock* label = genCreateTempLabel();
6499 inst_JMP(jumpKind[0], label);
6500
6501 // second branch is always to true label
6502 assert(branchToTrueLabel[1]);
6503 inst_SET(jumpKind[1], dstReg);
6504 genDefineTempLabel(label);
6505 }
6506
6507 var_types treeType = tree->TypeGet();
6508 if (treeType == TYP_INT || treeType == TYP_LONG)
6509 {
6510 // Set the higher bytes to 0
6511 inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
6512 }
6513 else
6514 {
6515 noway_assert(treeType == TYP_BYTE);
6516 }
6517}
6518
6519#if !defined(_TARGET_64BIT_)
6520//------------------------------------------------------------------------
6521// genLongToIntCast: Generate code for long to int casts on x86.
6522//
6523// Arguments:
6524// cast - The GT_CAST node
6525//
6526// Return Value:
6527// None.
6528//
6529// Assumptions:
6530// The cast node and its sources (via GT_LONG) must have been assigned registers.
6531// The destination cannot be a floating point type or a small integer type.
6532//
6533void CodeGen::genLongToIntCast(GenTree* cast)
6534{
6535 assert(cast->OperGet() == GT_CAST);
6536
6537 GenTree* src = cast->gtGetOp1();
6538 noway_assert(src->OperGet() == GT_LONG);
6539
6540 genConsumeRegs(src);
6541
6542 var_types srcType = ((cast->gtFlags & GTF_UNSIGNED) != 0) ? TYP_ULONG : TYP_LONG;
6543 var_types dstType = cast->CastToType();
6544 regNumber loSrcReg = src->gtGetOp1()->gtRegNum;
6545 regNumber hiSrcReg = src->gtGetOp2()->gtRegNum;
6546 regNumber dstReg = cast->gtRegNum;
6547
6548 assert((dstType == TYP_INT) || (dstType == TYP_UINT));
6549 assert(genIsValidIntReg(loSrcReg));
6550 assert(genIsValidIntReg(hiSrcReg));
6551 assert(genIsValidIntReg(dstReg));
6552
6553 if (cast->gtOverflow())
6554 {
6555 //
6556 // Generate an overflow check for [u]long to [u]int casts:
6557 //
6558 // long -> int - check if the upper 33 bits are all 0 or all 1
6559 //
6560 // ulong -> int - check if the upper 33 bits are all 0
6561 //
6562 // long -> uint - check if the upper 32 bits are all 0
6563 // ulong -> uint - check if the upper 32 bits are all 0
6564 //
6565
6566 if ((srcType == TYP_LONG) && (dstType == TYP_INT))
6567 {
6568 BasicBlock* allOne = genCreateTempLabel();
6569 BasicBlock* success = genCreateTempLabel();
6570
6571 inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE);
6572 inst_JMP(EJ_js, allOne);
6573
6574 inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE);
6575 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6576 inst_JMP(EJ_jmp, success);
6577
6578 genDefineTempLabel(allOne);
6579 inst_RV_IV(INS_cmp, hiSrcReg, -1, EA_4BYTE);
6580 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6581
6582 genDefineTempLabel(success);
6583 }
6584 else
6585 {
6586 if ((srcType == TYP_ULONG) && (dstType == TYP_INT))
6587 {
6588 inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE);
6589 genJumpToThrowHlpBlk(EJ_js, SCK_OVERFLOW);
6590 }
6591
6592 inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE);
6593 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6594 }
6595 }
6596
6597 if (dstReg != loSrcReg)
6598 {
6599 inst_RV_RV(INS_mov, dstReg, loSrcReg, TYP_INT, EA_4BYTE);
6600 }
6601
6602 genProduceReg(cast);
6603}
6604#endif
6605
6606//------------------------------------------------------------------------
6607// genIntCastOverflowCheck: Generate overflow checking code for an integer cast.
6608//
6609// Arguments:
6610// cast - The GT_CAST node
6611// desc - The cast description
6612// reg - The register containing the value to check
6613//
6614void CodeGen::genIntCastOverflowCheck(GenTreeCast* cast, const GenIntCastDesc& desc, regNumber reg)
6615{
6616 switch (desc.CheckKind())
6617 {
6618 case GenIntCastDesc::CHECK_POSITIVE:
6619 getEmitter()->emitIns_R_R(INS_test, EA_SIZE(desc.CheckSrcSize()), reg, reg);
6620 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6621 break;
6622
6623#ifdef _TARGET_64BIT_
6624 case GenIntCastDesc::CHECK_UINT_RANGE:
6625 {
6626 // We need to check if the value is not greater than 0xFFFFFFFF but this value
6627 // cannot be encoded in an immediate operand. Use a right shift to test if the
6628 // upper 32 bits are zero. This requires a temporary register.
6629 const regNumber tempReg = cast->GetSingleTempReg();
6630 assert(tempReg != reg);
6631 getEmitter()->emitIns_R_R(INS_mov, EA_8BYTE, tempReg, reg);
6632 getEmitter()->emitIns_R_I(INS_shr_N, EA_8BYTE, tempReg, 32);
6633 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6634 }
6635 break;
6636
6637 case GenIntCastDesc::CHECK_POSITIVE_INT_RANGE:
6638 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX);
6639 genJumpToThrowHlpBlk(EJ_ja, SCK_OVERFLOW);
6640 break;
6641
6642 case GenIntCastDesc::CHECK_INT_RANGE:
6643 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX);
6644 genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW);
6645 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MIN);
6646 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6647 break;
6648#endif
6649
6650 default:
6651 {
6652 assert(desc.CheckKind() == GenIntCastDesc::CHECK_SMALL_INT_RANGE);
6653 const int castMaxValue = desc.CheckSmallIntMax();
6654 const int castMinValue = desc.CheckSmallIntMin();
6655
6656 getEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMaxValue);
6657 genJumpToThrowHlpBlk((castMinValue == 0) ? EJ_ja : EJ_jg, SCK_OVERFLOW);
6658
6659 if (castMinValue != 0)
6660 {
6661 getEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMinValue);
6662 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6663 }
6664 }
6665 break;
6666 }
6667}
6668
6669//------------------------------------------------------------------------
6670// genIntToIntCast: Generate code for an integer cast, with or without overflow check.
6671//
6672// Arguments:
6673// cast - The GT_CAST node
6674//
6675// Assumptions:
6676// The cast node is not a contained node and must have an assigned register.
6677// Neither the source nor target type can be a floating point type.
6678// On x86 casts to (U)BYTE require that the source be in a byte register.
6679//
6680// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register.
6681//
6682void CodeGen::genIntToIntCast(GenTreeCast* cast)
6683{
6684 genConsumeRegs(cast->gtGetOp1());
6685
6686 const regNumber srcReg = cast->gtGetOp1()->gtRegNum;
6687 const regNumber dstReg = cast->gtRegNum;
6688
6689 assert(genIsValidIntReg(srcReg));
6690 assert(genIsValidIntReg(dstReg));
6691
6692 GenIntCastDesc desc(cast);
6693
6694 if (desc.CheckKind() != GenIntCastDesc::CHECK_NONE)
6695 {
6696 genIntCastOverflowCheck(cast, desc, srcReg);
6697 }
6698
6699 if ((desc.ExtendKind() != GenIntCastDesc::COPY) || (srcReg != dstReg))
6700 {
6701 instruction ins;
6702 unsigned insSize;
6703
6704 switch (desc.ExtendKind())
6705 {
6706 case GenIntCastDesc::ZERO_EXTEND_SMALL_INT:
6707 ins = INS_movzx;
6708 insSize = desc.ExtendSrcSize();
6709 break;
6710 case GenIntCastDesc::SIGN_EXTEND_SMALL_INT:
6711 ins = INS_movsx;
6712 insSize = desc.ExtendSrcSize();
6713 break;
6714#ifdef _TARGET_64BIT_
6715 case GenIntCastDesc::ZERO_EXTEND_INT:
6716 ins = INS_mov;
6717 insSize = 4;
6718 break;
6719 case GenIntCastDesc::SIGN_EXTEND_INT:
6720 ins = INS_movsxd;
6721 insSize = 4;
6722 break;
6723#endif
6724 default:
6725 assert(desc.ExtendKind() == GenIntCastDesc::COPY);
6726 ins = INS_mov;
6727 insSize = desc.ExtendSrcSize();
6728 break;
6729 }
6730
6731 getEmitter()->emitIns_R_R(ins, EA_ATTR(insSize), dstReg, srcReg);
6732 }
6733
6734 genProduceReg(cast);
6735}
6736
6737//------------------------------------------------------------------------
6738// genFloatToFloatCast: Generate code for a cast between float and double
6739//
6740// Arguments:
6741// treeNode - The GT_CAST node
6742//
6743// Return Value:
6744// None.
6745//
6746// Assumptions:
6747// Cast is a non-overflow conversion.
6748// The treeNode must have an assigned register.
6749// The cast is between float and double or vice versa.
6750//
6751void CodeGen::genFloatToFloatCast(GenTree* treeNode)
6752{
6753 // float <--> double conversions are always non-overflow ones
6754 assert(treeNode->OperGet() == GT_CAST);
6755 assert(!treeNode->gtOverflow());
6756
6757 regNumber targetReg = treeNode->gtRegNum;
6758 assert(genIsValidFloatReg(targetReg));
6759
6760 GenTree* op1 = treeNode->gtOp.gtOp1;
6761#ifdef DEBUG
6762 // If not contained, must be a valid float reg.
6763 if (op1->isUsedFromReg())
6764 {
6765 assert(genIsValidFloatReg(op1->gtRegNum));
6766 }
6767#endif
6768
6769 var_types dstType = treeNode->CastToType();
6770 var_types srcType = op1->TypeGet();
6771 assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
6772
6773 genConsumeOperands(treeNode->AsOp());
6774 if (srcType == dstType && (op1->isUsedFromReg() && (targetReg == op1->gtRegNum)))
6775 {
        // The source and destination types are the same and the value is already in the
        // target register, so there is nothing to do beyond consuming and producing the reg.
6778 ;
6779 }
6780 else
6781 {
6782 instruction ins = ins_FloatConv(dstType, srcType);
6783 getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
6784 }
6785
6786 genProduceReg(treeNode);
6787}
6788
6789//------------------------------------------------------------------------
6790// genIntToFloatCast: Generate code to cast an int/long to float/double
6791//
6792// Arguments:
6793// treeNode - The GT_CAST node
6794//
6795// Return Value:
6796// None.
6797//
6798// Assumptions:
6799// Cast is a non-overflow conversion.
6800// The treeNode must have an assigned register.
6801// SrcType= int32/uint32/int64/uint64 and DstType=float/double.
6802//
6803void CodeGen::genIntToFloatCast(GenTree* treeNode)
6804{
6805 // int type --> float/double conversions are always non-overflow ones
6806 assert(treeNode->OperGet() == GT_CAST);
6807 assert(!treeNode->gtOverflow());
6808
6809 regNumber targetReg = treeNode->gtRegNum;
6810 assert(genIsValidFloatReg(targetReg));
6811
6812 GenTree* op1 = treeNode->gtOp.gtOp1;
6813#ifdef DEBUG
6814 if (op1->isUsedFromReg())
6815 {
6816 assert(genIsValidIntReg(op1->gtRegNum));
6817 }
6818#endif
6819
6820 var_types dstType = treeNode->CastToType();
6821 var_types srcType = op1->TypeGet();
6822 assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
6823
6824#if !defined(_TARGET_64BIT_)
6825 // We expect morph to replace long to float/double casts with helper calls
6826 noway_assert(!varTypeIsLong(srcType));
6827#endif // !defined(_TARGET_64BIT_)
6828
    // Since the xarch emitter doesn't report GC info correctly when casting away GC-ness, we
    // ensure that the srcType of a cast is a non-GC type. Codegen should never see TYP_BYREF as
    // a source type except for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR, which represent stack
    // addresses and can be treated as TYP_I_IMPL. In all other cases, where the source operand
    // is a GC type and not known to be on the stack, the front-end (see fgMorphCast()) ensures
    // this by assigning the GC-typed local to a non-GC-typed temp and using that temp as the
    // operand of the cast.
6835 if (srcType == TYP_BYREF)
6836 {
6837 noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR);
6838 srcType = TYP_I_IMPL;
6839 }
6840
6841 // force the srcType to unsigned if GT_UNSIGNED flag is set
6842 if (treeNode->gtFlags & GTF_UNSIGNED)
6843 {
6844 srcType = genUnsignedType(srcType);
6845 }
6846
6847 noway_assert(!varTypeIsGC(srcType));
6848
    // We should never see a srcType whose size is neither sizeof(int) nor sizeof(long).
    // For conversions from byte/sbyte/int16/uint16 to float/double, we expect either the
    // front-end or the lowering phase to have generated two levels of cast: the first widens
    // the smaller int type to int32, and the second converts int32 to float/double.
6854 emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
6855 noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
6856
    // Also, we don't expect to see uint32 -> float/double or uint64 -> float conversions
    // here since they should have been lowered appropriately.
6859 noway_assert(srcType != TYP_UINT);
6860 noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT));
6861
    // To convert an int to a float/double, the cvtsi2ss/sd SSE2 instruction is used, which does
    // a partial write to the lower 4/8 bytes of the xmm register, keeping the upper bytes
    // unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, the partial write can
    // introduce a false dependency and cause a stall if there are further uses of xmmReg. We hit
    // such a case with a customer-reported version of the SpectralNorm benchmark, which showed a
    // 2x perf regression. To avoid the false dependency, we emit "xorps xmmReg, xmmReg" before
    // the cvtsi2ss/sd instruction.
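    //
    // For example (illustrative; the registers are only for exposition), converting a long in
    // rcx to a double in xmm0 emits:
    //   xorps    xmm0, xmm0        ; break the false dependency on xmm0's previous contents
    //   cvtsi2sd xmm0, rcx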
6870
6871 genConsumeOperands(treeNode->AsOp());
6872 getEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->gtRegNum, treeNode->gtRegNum);
6873
    // Note that here we need to specify srcType, which determines the size of the
    // source reg/mem operand and the rex.w prefix.
6876 instruction ins = ins_FloatConv(dstType, TYP_INT);
6877 getEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
6878
6879 // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
6880 // will interpret ULONG value as LONG. Hence we need to adjust the
6881 // result if sign-bit of srcType is set.
6882 if (srcType == TYP_ULONG)
6883 {
        // The instruction sequence below is less accurate than what clang and gcc generate.
        // However, we keep the current sequence for backward compatibility. If we change the
        // instructions below, FloatingPointUtils::convertUInt64ToDobule should also be updated
        // so that the conversion results remain consistent.
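        //
        // The 0x43f0000000000000 constant below is 2^64 encoded as a double, so, as an
        // illustrative example, an input of 0xFFFFFFFFFFFFFFFF is first converted as -1.0 and
        // then adjusted to -1.0 + 2^64, i.e. roughly 1.8446744073709552e19, the expected
        // unsigned value.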
6888 assert(dstType == TYP_DOUBLE);
6889 assert(op1->isUsedFromReg());
6890
6891 // Set the flags without modifying op1.
6892 // test op1Reg, op1Reg
6893 inst_RV_RV(INS_test, op1->gtRegNum, op1->gtRegNum, srcType);
6894
6895 // No need to adjust result if op1 >= 0 i.e. positive
6896 // Jge label
6897 BasicBlock* label = genCreateTempLabel();
6898 inst_JMP(EJ_jge, label);
6899
6900 // Adjust the result
6901 // result = result + 0x43f00000 00000000
6902 // addsd resultReg, 0x43f00000 00000000
6903 CORINFO_FIELD_HANDLE* cns = &u8ToDblBitmask;
6904 if (*cns == nullptr)
6905 {
6906 double d;
6907 static_assert_no_msg(sizeof(double) == sizeof(__int64));
6908 *((__int64*)&d) = 0x43f0000000000000LL;
6909
6910 *cns = getEmitter()->emitFltOrDblConst(d, EA_8BYTE);
6911 }
6912 getEmitter()->emitIns_R_C(INS_addsd, EA_8BYTE, treeNode->gtRegNum, *cns, 0);
6913
6914 genDefineTempLabel(label);
6915 }
6916
6917 genProduceReg(treeNode);
6918}
6919
6920//------------------------------------------------------------------------
6921// genFloatToIntCast: Generate code to cast float/double to int/long
6922//
6923// Arguments:
6924// treeNode - The GT_CAST node
6925//
6926// Return Value:
6927// None.
6928//
6929// Assumptions:
6930// Cast is a non-overflow conversion.
6931// The treeNode must have an assigned register.
6932// SrcType=float/double and DstType= int32/uint32/int64/uint64
6933//
6934// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64
6935//
6936void CodeGen::genFloatToIntCast(GenTree* treeNode)
6937{
    // We don't expect to see overflow-detecting float/double --> int type conversions here,
    // as they should have been converted into helper calls by the front-end.
6940 assert(treeNode->OperGet() == GT_CAST);
6941 assert(!treeNode->gtOverflow());
6942
6943 regNumber targetReg = treeNode->gtRegNum;
6944 assert(genIsValidIntReg(targetReg));
6945
6946 GenTree* op1 = treeNode->gtOp.gtOp1;
6947#ifdef DEBUG
6948 if (op1->isUsedFromReg())
6949 {
6950 assert(genIsValidFloatReg(op1->gtRegNum));
6951 }
6952#endif
6953
6954 var_types dstType = treeNode->CastToType();
6955 var_types srcType = op1->TypeGet();
6956 assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
6957
6958 // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG).
6959 // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the
6960 // front-end or lowering phase to have generated two levels of cast. The first one is
6961 // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to
6962 // the required smaller int type.
6963 emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
6964 noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
6965
6966 // We shouldn't be seeing uint64 here as it should have been converted
6967 // into a helper call by either front-end or lowering phase.
6968 noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))));
6969
    // If the dstType is TYP_UINT, we need all 32 bits of the result to encode the value, so the
    // sign bit must live in bit 33 or above. To achieve this we pretend we are converting to a
    // long, which makes the conversion use the full 64-bit destination register.
6973 if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))))
6974 {
6975 dstType = TYP_LONG;
6976 }
6977
    // Note that we need to specify dstType here, since it determines the size of the
    // destination integer register and the rex.w prefix.
6980 genConsumeOperands(treeNode->AsOp());
6981 instruction ins = ins_FloatConv(TYP_INT, srcType);
6982 getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
6983 genProduceReg(treeNode);
6984}
6985
6986//------------------------------------------------------------------------
6987// genCkfinite: Generate code for ckfinite opcode.
6988//
6989// Arguments:
6990// treeNode - The GT_CKFINITE node
6991//
6992// Return Value:
6993// None.
6994//
6995// Assumptions:
6996// GT_CKFINITE node has reserved an internal register.
6997//
6998// TODO-XArch-CQ - mark the operand as contained if known to be in
6999// memory (e.g. field or an array element).
7000//
7001void CodeGen::genCkfinite(GenTree* treeNode)
7002{
7003 assert(treeNode->OperGet() == GT_CKFINITE);
7004
7005 GenTree* op1 = treeNode->gtOp.gtOp1;
7006 var_types targetType = treeNode->TypeGet();
7007 int expMask = (targetType == TYP_FLOAT) ? 0x7F800000 : 0x7FF00000; // Bit mask to extract exponent.
7008 regNumber targetReg = treeNode->gtRegNum;
7009
7010 // Extract exponent into a register.
7011 regNumber tmpReg = treeNode->GetSingleTempReg();
7012
7013 genConsumeReg(op1);
7014
7015#ifdef _TARGET_64BIT_
7016
7017 // Copy the floating-point value to an integer register. If we copied a float to a long, then
7018 // right-shift the value so the high 32 bits of the floating-point value sit in the low 32
7019 // bits of the integer register.
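    //
    // For TYP_DOUBLE this emits, roughly (illustrative):
    //   mov_xmm2i tmpReg, op1Reg    ; copy the raw bits to the integer temp
    //   shr       tmpReg, 32        ; bring the exponent into the low 32 bits
    //   and       tmpReg, 0x7FF00000
    //   cmp       tmpReg, 0x7FF00000
    //   je        <throw block>     ; an all-ones exponent means NaN or infinity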
7020 instruction ins = ins_CopyFloatToInt(targetType, (targetType == TYP_FLOAT) ? TYP_INT : TYP_LONG);
7021 inst_RV_RV(ins, op1->gtRegNum, tmpReg, targetType);
7022 if (targetType == TYP_DOUBLE)
7023 {
7024 // right shift by 32 bits to get to exponent.
7025 inst_RV_SH(INS_shr, EA_8BYTE, tmpReg, 32);
7026 }
7027
7028 // Mask exponent with all 1's and check if the exponent is all 1's
7029 inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
7030 inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
7031
7032 // If exponent is all 1's, throw ArithmeticException
7033 genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
7034
7035 // if it is a finite value copy it to targetReg
7036 if (targetReg != op1->gtRegNum)
7037 {
7038 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7039 }
7040
7041#else // !_TARGET_64BIT_
7042
    // If the target type is TYP_DOUBLE, we want to extract the high 32 bits into the register.
    // There is no easy way to do this. To avoid requiring an extra register, we use shuffles
    // to move the high 32 bits into the low 32 bits, then shuffle them back, since we need to
    // produce the value in the target register.
7047 //
7048 // For TYP_DOUBLE, we'll generate (for targetReg != op1->gtRegNum):
7049 // movaps targetReg, op1->gtRegNum
7050 // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
7051 // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
7052 // and tmpReg, <mask>
7053 // cmp tmpReg, <mask>
7054 // je <throw block>
7055 // movaps targetReg, op1->gtRegNum // copy the value again, instead of un-shuffling it
7056 //
7057 // For TYP_DOUBLE with (targetReg == op1->gtRegNum):
7058 // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
7059 // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
7060 // and tmpReg, <mask>
7061 // cmp tmpReg, <mask>
7062 // je <throw block>
7063 // shufps targetReg, targetReg, 0xB1 // ZWXY => WZYX
7064 //
7065 // For TYP_FLOAT, it's the same as _TARGET_64BIT_:
7066 // mov_xmm2i tmpReg, targetReg // tmpReg <= low 32 bits
7067 // and tmpReg, <mask>
7068 // cmp tmpReg, <mask>
7069 // je <throw block>
7070 // movaps targetReg, op1->gtRegNum // only if targetReg != op1->gtRegNum
7071
7072 regNumber copyToTmpSrcReg; // The register we'll copy to the integer temp.
7073
7074 if (targetType == TYP_DOUBLE)
7075 {
7076 if (targetReg != op1->gtRegNum)
7077 {
7078 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7079 }
7080 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
7081 copyToTmpSrcReg = targetReg;
7082 }
7083 else
7084 {
7085 copyToTmpSrcReg = op1->gtRegNum;
7086 }
7087
7088 // Copy only the low 32 bits. This will be the high order 32 bits of the floating-point
7089 // value, no matter the floating-point type.
7090 inst_RV_RV(ins_CopyFloatToInt(TYP_FLOAT, TYP_INT), copyToTmpSrcReg, tmpReg, TYP_FLOAT);
7091
7092 // Mask exponent with all 1's and check if the exponent is all 1's
7093 inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
7094 inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
7095
7096 // If exponent is all 1's, throw ArithmeticException
7097 genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
7098
7099 if (targetReg != op1->gtRegNum)
7100 {
7101 // In both the TYP_FLOAT and TYP_DOUBLE case, the op1 register is untouched,
7102 // so copy it to the targetReg. This is faster and smaller for TYP_DOUBLE
7103 // than re-shuffling the targetReg.
7104 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7105 }
7106 else if (targetType == TYP_DOUBLE)
7107 {
7108 // We need to re-shuffle the targetReg to get the correct result.
7109 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
7110 }
7111
7112#endif // !_TARGET_64BIT_
7113
7114 genProduceReg(treeNode);
7115}
7116
7117#ifdef _TARGET_AMD64_
7118int CodeGenInterface::genSPtoFPdelta()
7119{
7120 int delta;
7121
7122#ifdef UNIX_AMD64_ABI
7123
7124 // We require frame chaining on Unix to support native tool unwinding (such as
7125 // unwinding by the native debugger). We have a CLR-only extension to the
7126 // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240.
7127 // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated.
7128 delta = genTotalFrameSize();
7129
7130#else // !UNIX_AMD64_ABI
7131
    // As per the AMD64 ABI, the RBP offset from the initial RSP can be between 0 and 240 if
    // RBP needs to be reported in the unwind codes. This case arises for methods with localloc.
7135 if (compiler->compLocallocUsed)
7136 {
        // We cannot base the delta computation on compLclFrameSize since it changes between the
        // tentative and final frame layouts, which creates the possibility of under-estimating
        // the offsets of variables from FP and, in turn, under-estimating instruction sizes.
        //
        // To be predictable, and to never under-estimate the offset of a variable from FP, we
        // always position FP at min(240, outgoing arg area size).
7144 delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize);
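
        // For example: with lvaOutgoingArgSpaceSize == 0x20 the frame pointer is positioned
        // 0x20 bytes above SP, while any outgoing arg area of 240 bytes or more pins it at the
        // 240-byte maximum representable by the unwind codes.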
7145 }
7146 else if (compiler->opts.compDbgEnC)
7147 {
        // The VM assumes that RSP and RBP are equal in EnC methods.
7149 delta = 0;
7150 }
7151 else
7152 {
7153 delta = genTotalFrameSize();
7154 }
7155
7156#endif // !UNIX_AMD64_ABI
7157
7158 return delta;
7159}
7160
7161//---------------------------------------------------------------------
7162// genTotalFrameSize - return the total size of the stack frame, including local size,
7163// callee-saved register size, etc. For AMD64, this does not include the caller-pushed
7164// return address.
7165//
7166// Return value:
7167// Total frame size
7168//
7169
7170int CodeGenInterface::genTotalFrameSize()
7171{
7172 assert(!IsUninitialized(compiler->compCalleeRegsPushed));
7173
7174 int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize;
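
    // For example (illustrative): 4 callee-saved registers and a compLclFrameSize of 0x28 give
    // a total frame size of 4 * 8 + 0x28 = 0x48 bytes on AMD64.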
7175
7176 assert(totalFrameSize >= 0);
7177 return totalFrameSize;
7178}
7179
7180//---------------------------------------------------------------------
7181// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer.
7182// This number is going to be negative, since the Caller-SP is at a higher
7183// address than the frame pointer.
7184//
7185// There must be a frame pointer to call this function!
7186//
7187// We can't compute this directly from the Caller-SP, since the frame pointer
7188// is based on a maximum delta from Initial-SP, so first we find SP, then
7189// compute the FP offset.
7190
7191int CodeGenInterface::genCallerSPtoFPdelta()
7192{
7193 assert(isFramePointerUsed());
7194 int callerSPtoFPdelta;
7195
7196 callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta();
7197
7198 assert(callerSPtoFPdelta <= 0);
7199 return callerSPtoFPdelta;
7200}
7201
7202//---------------------------------------------------------------------
7203// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP.
7204//
7205// This number will be negative.
7206
7207int CodeGenInterface::genCallerSPtoInitialSPdelta()
7208{
7209 int callerSPtoSPdelta = 0;
7210
7211 callerSPtoSPdelta -= genTotalFrameSize();
7212 callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address
7213
7214 // compCalleeRegsPushed does not account for the frame pointer
7215 // TODO-Cleanup: shouldn't this be part of genTotalFrameSize?
7216 if (isFramePointerUsed())
7217 {
7218 callerSPtoSPdelta -= REGSIZE_BYTES;
7219 }
7220
7221 assert(callerSPtoSPdelta <= 0);
7222 return callerSPtoSPdelta;
7223}
7224#endif // _TARGET_AMD64_
7225
7226//-----------------------------------------------------------------------------------------
7227// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask"
7228//
7229// Arguments:
7230// treeNode - tree node
7231//
7232// Return value:
7233// None
7234//
7235// Assumptions:
7236// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs()
7237// ii) tree type is floating point type.
7238// iii) caller of this routine needs to call genProduceReg()
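//
// A sketch of the emitted code (illustrative registers; xmm1 is the reserved internal temp):
//   GT_NEG on a double held in xmm0:
//     movsd xmm1, qword ptr [negBitmaskDbl]   ; the 0x8000000000000000 sign-bit mask
//     xorps xmm0, xmm1
//   Abs() swaps the mask and opcode: andps with the 0x7fffffffffffffff mask instead.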
7239void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
7240{
7241 regNumber targetReg = treeNode->gtRegNum;
7242 var_types targetType = treeNode->TypeGet();
7243 assert(varTypeIsFloating(targetType));
7244
7245 float f;
7246 double d;
7247 CORINFO_FIELD_HANDLE* bitMask = nullptr;
7248 instruction ins = INS_invalid;
7249 void* cnsAddr = nullptr;
7250 bool dblAlign = false;
7251
7252 switch (treeNode->OperGet())
7253 {
7254 case GT_NEG:
7255 // Neg(x) = flip the sign bit.
7256 // Neg(f) = f ^ 0x80000000
7257 // Neg(d) = d ^ 0x8000000000000000
7258 ins = INS_xorps;
7259 if (targetType == TYP_FLOAT)
7260 {
7261 bitMask = &negBitmaskFlt;
7262
7263 static_assert_no_msg(sizeof(float) == sizeof(int));
7264 *((int*)&f) = 0x80000000;
7265 cnsAddr = &f;
7266 }
7267 else
7268 {
7269 bitMask = &negBitmaskDbl;
7270
7271 static_assert_no_msg(sizeof(double) == sizeof(__int64));
7272 *((__int64*)&d) = 0x8000000000000000LL;
7273 cnsAddr = &d;
7274 dblAlign = true;
7275 }
7276 break;
7277
7278 case GT_INTRINSIC:
7279 assert(treeNode->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs);
7280
7281 // Abs(x) = set sign-bit to zero
7282 // Abs(f) = f & 0x7fffffff
7283 // Abs(d) = d & 0x7fffffffffffffff
7284 ins = INS_andps;
7285 if (targetType == TYP_FLOAT)
7286 {
7287 bitMask = &absBitmaskFlt;
7288
7289 static_assert_no_msg(sizeof(float) == sizeof(int));
7290 *((int*)&f) = 0x7fffffff;
7291 cnsAddr = &f;
7292 }
7293 else
7294 {
7295 bitMask = &absBitmaskDbl;
7296
7297 static_assert_no_msg(sizeof(double) == sizeof(__int64));
7298 *((__int64*)&d) = 0x7fffffffffffffffLL;
7299 cnsAddr = &d;
7300 dblAlign = true;
7301 }
7302 break;
7303
7304 default:
7305 assert(!"genSSE2: unsupported oper");
7306 unreached();
7307 break;
7308 }
7309
7310 if (*bitMask == nullptr)
7311 {
7312 assert(cnsAddr != nullptr);
7313 *bitMask = getEmitter()->emitAnyConst(cnsAddr, genTypeSize(targetType), dblAlign);
7314 }
7315
7316 // We need an additional register for bitmask.
7317 regNumber tmpReg = treeNode->GetSingleTempReg();
7318
7319 // Move operand into targetReg only if the reg reserved for
7320 // internal purpose is not the same as targetReg.
7321 GenTree* op1 = treeNode->gtOp.gtOp1;
7322 assert(op1->isUsedFromReg());
7323 regNumber operandReg = genConsumeReg(op1);
7324 if (tmpReg != targetReg)
7325 {
7326 if (operandReg != targetReg)
7327 {
7328 inst_RV_RV(ins_Copy(targetType), targetReg, operandReg, targetType);
7329 }
7330
7331 operandReg = tmpReg;
7332 }
7333
7334 getEmitter()->emitIns_R_C(ins_Load(targetType, false), emitTypeSize(targetType), tmpReg, *bitMask, 0);
7335 assert(ins != INS_invalid);
7336 inst_RV_RV(ins, targetReg, operandReg, targetType);
7337}
7338
7339//-----------------------------------------------------------------------------------------
7340// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation
7341//
7342// Arguments:
7343// treeNode - tree node
7344//
7345// Return value:
7346// None
7347//
7348// Assumptions:
7349// i) SSE4.1 is supported by the underlying hardware
7350// ii) treeNode oper is a GT_INTRINSIC
7351// iii) treeNode type is a floating point type
7352// iv) treeNode is not used from memory
7353// v) tree oper is CORINFO_INTRINSIC_Round, _Ceiling, or _Floor
7354// vi) caller of this routine needs to call genProduceReg()
7355void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode)
7356{
7357 // i) SSE4.1 is supported by the underlying hardware
7358 assert(compiler->compSupports(InstructionSet_SSE41));
7359
7360 // ii) treeNode oper is a GT_INTRINSIC
7361 assert(treeNode->OperGet() == GT_INTRINSIC);
7362
7363 GenTree* srcNode = treeNode->gtGetOp1();
7364
7365 // iii) treeNode type is floating point type
7366 assert(varTypeIsFloating(srcNode));
7367 assert(srcNode->TypeGet() == treeNode->TypeGet());
7368
7369 // iv) treeNode is not used from memory
7370 assert(!treeNode->isUsedFromMemory());
7371
7372 genConsumeOperands(treeNode);
7373
7374 instruction ins = (treeNode->TypeGet() == TYP_FLOAT) ? INS_roundss : INS_roundsd;
7375 emitAttr size = emitTypeSize(treeNode);
7376
7377 regNumber dstReg = treeNode->gtRegNum;
7378
7379 unsigned ival = 0;
7380
7381 // v) tree oper is CORINFO_INTRINSIC_Round, _Ceiling, or _Floor
7382 switch (treeNode->gtIntrinsic.gtIntrinsicId)
7383 {
7384 case CORINFO_INTRINSIC_Round:
7385 ival = 4;
7386 break;
7387
7388 case CORINFO_INTRINSIC_Ceiling:
7389 ival = 10;
7390 break;
7391
7392 case CORINFO_INTRINSIC_Floor:
7393 ival = 9;
7394 break;
7395
7396 default:
7397 ins = INS_invalid;
7398 assert(!"genSSE41RoundOp: unsupported intrinsic");
7399 unreached();
7400 }
7401
7402 if (srcNode->isContained() || srcNode->isUsedFromSpillTemp())
7403 {
7404 emitter* emit = getEmitter();
7405
7406 TempDsc* tmpDsc = nullptr;
7407 unsigned varNum = BAD_VAR_NUM;
7408 unsigned offset = (unsigned)-1;
7409
7410 if (srcNode->isUsedFromSpillTemp())
7411 {
7412 assert(srcNode->IsRegOptional());
7413
7414 tmpDsc = getSpillTempDsc(srcNode);
7415 varNum = tmpDsc->tdTempNum();
7416 offset = 0;
7417
7418 regSet.tmpRlsTemp(tmpDsc);
7419 }
7420 else if (srcNode->isIndir())
7421 {
7422 GenTreeIndir* memIndir = srcNode->AsIndir();
7423 GenTree* memBase = memIndir->gtOp1;
7424
7425 switch (memBase->OperGet())
7426 {
7427 case GT_LCL_VAR_ADDR:
7428 {
7429 varNum = memBase->AsLclVarCommon()->GetLclNum();
7430 offset = 0;
7431
7432 // Ensure that all the GenTreeIndir values are set to their defaults.
7433 assert(memBase->gtRegNum == REG_NA);
7434 assert(!memIndir->HasIndex());
7435 assert(memIndir->Scale() == 1);
7436 assert(memIndir->Offset() == 0);
7437
7438 break;
7439 }
7440
7441 case GT_CLS_VAR_ADDR:
7442 {
7443 emit->emitIns_R_C_I(ins, size, dstReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
7444 return;
7445 }
7446
7447 default:
7448 {
7449 emit->emitIns_R_A_I(ins, size, dstReg, memIndir, ival);
7450 return;
7451 }
7452 }
7453 }
7454 else
7455 {
7456 switch (srcNode->OperGet())
7457 {
7458 case GT_CNS_DBL:
7459 {
7460 GenTreeDblCon* dblConst = srcNode->AsDblCon();
7461 CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(dblConst->gtDconVal, emitTypeSize(dblConst));
7462
7463 emit->emitIns_R_C_I(ins, size, dstReg, hnd, 0, ival);
7464 return;
7465 }
7466
7467 case GT_LCL_FLD:
7468 {
7469 GenTreeLclFld* lclField = srcNode->AsLclFld();
7470
7471 varNum = lclField->GetLclNum();
7472 offset = lclField->gtLclFld.gtLclOffs;
7473 break;
7474 }
7475
7476 case GT_LCL_VAR:
7477 {
7478 assert(srcNode->IsRegOptional() ||
7479 !compiler->lvaTable[srcNode->gtLclVar.gtLclNum].lvIsRegCandidate());
7480
7481 varNum = srcNode->AsLclVar()->GetLclNum();
7482 offset = 0;
7483 break;
7484 }
7485
7486 default:
7487 unreached();
7488 break;
7489 }
7490 }
7491
7492 // Ensure we got a good varNum and offset.
7493 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
7494 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
7495 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
7496 assert(offset != (unsigned)-1);
7497
7498 emit->emitIns_R_S_I(ins, size, dstReg, varNum, offset, ival);
7499 }
7500 else
7501 {
7502 inst_RV_RV_IV(ins, size, dstReg, srcNode->gtRegNum, ival);
7503 }
7504}
7505
7506//---------------------------------------------------------------------
7507// genIntrinsic - generate code for a given intrinsic
7508//
7509// Arguments
7510// treeNode - the GT_INTRINSIC node
7511//
7512// Return value:
7513// None
7514//
7515void CodeGen::genIntrinsic(GenTree* treeNode)
7516{
    // Right now only Sqrt, Abs, Round, Ceiling, and Floor are treated as math intrinsics.
7518 switch (treeNode->gtIntrinsic.gtIntrinsicId)
7519 {
7520 case CORINFO_INTRINSIC_Sqrt:
7521 {
7522 // Both operand and its result must be of the same floating point type.
7523 GenTree* srcNode = treeNode->gtOp.gtOp1;
7524 assert(varTypeIsFloating(srcNode));
7525 assert(srcNode->TypeGet() == treeNode->TypeGet());
7526
7527 genConsumeOperands(treeNode->AsOp());
7528 getEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode, srcNode);
7529 break;
7530 }
7531
7532 case CORINFO_INTRINSIC_Abs:
7533 genSSE2BitwiseOp(treeNode);
7534 break;
7535
7536 case CORINFO_INTRINSIC_Round:
7537 case CORINFO_INTRINSIC_Ceiling:
7538 case CORINFO_INTRINSIC_Floor:
7539 genSSE41RoundOp(treeNode->AsOp());
7540 break;
7541
7542 default:
7543 assert(!"genIntrinsic: Unsupported intrinsic");
7544 unreached();
7545 }
7546
7547 genProduceReg(treeNode);
7548}
7549
7550//-------------------------------------------------------------------------- //
7551// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg.
7552//
7553// Arguments
7554// treeNode - the GT_PUTARG_STK node
7555//
7556// Return value:
7557// The number of the base variable.
7558//
7559// Note:
7560// If tail call the outgoing args are placed in the caller's incoming arg stack space.
7561// Otherwise, they go in the outgoing arg area on the current frame.
7562//
7563// On Windows the caller always creates slots (homing space) in its frame for the
7564// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0.
7565// For System V systems there is no such calling convention requirement, and the code needs to find
7566// the first stack passed argument from the caller. This is done by iterating over
7567// all the lvParam variables and finding the first with lvArgReg equals to REG_STK.
7568//
7569unsigned CodeGen::getBaseVarForPutArgStk(GenTree* treeNode)
7570{
7571 assert(treeNode->OperGet() == GT_PUTARG_STK);
7572
7573 unsigned baseVarNum;
7574
    // Do we set up the stack arg in the incoming or the outgoing arg area?
    // Fast tail calls are implemented as epilog+jmp, so their stack args are set up in the
    // caller's incoming arg area. All other calls set up stack args in the outgoing arg area.
7578 if (treeNode->AsPutArgStk()->putInIncomingArgArea())
7579 {
7580 // See the note in the function header re: finding the first stack passed argument.
7581 baseVarNum = getFirstArgWithStackSlot();
7582 assert(baseVarNum != BAD_VAR_NUM);
7583
7584#ifdef DEBUG
7585 // This must be a fast tail call.
7586 assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall());
7587
        // Since it is a fast tail call, the existence of a first incoming arg is guaranteed
        // because a fast tail call requires that the caller's incoming arg area be >= the
        // outgoing arg area required for the tail call.
7591 LclVarDsc* varDsc = &(compiler->lvaTable[baseVarNum]);
7592 assert(varDsc != nullptr);
7593
7594#ifdef UNIX_AMD64_ABI
7595 assert(!varDsc->lvIsRegArg && varDsc->lvArgReg == REG_STK);
7596#else // !UNIX_AMD64_ABI
7597 // On Windows this assert is always true. The first argument will always be in REG_ARG_0 or REG_FLTARG_0.
7598 assert(varDsc->lvIsRegArg && (varDsc->lvArgReg == REG_ARG_0 || varDsc->lvArgReg == REG_FLTARG_0));
7599#endif // !UNIX_AMD64_ABI
#endif // DEBUG
7601 }
7602 else
7603 {
7604#if FEATURE_FIXED_OUT_ARGS
7605 baseVarNum = compiler->lvaOutgoingArgSpaceVar;
7606#else // !FEATURE_FIXED_OUT_ARGS
7607 assert(!"No BaseVarForPutArgStk on x86");
7608 baseVarNum = BAD_VAR_NUM;
7609#endif // !FEATURE_FIXED_OUT_ARGS
7610 }
7611
7612 return baseVarNum;
7613}
7614
7615//---------------------------------------------------------------------
7616// genAlignStackBeforeCall: Align the stack if necessary before a call.
7617//
7618// Arguments:
7619// putArgStk - the putArgStk node.
7620//
7621void CodeGen::genAlignStackBeforeCall(GenTreePutArgStk* putArgStk)
7622{
7623#if defined(UNIX_X86_ABI)
7624
7625 genAlignStackBeforeCall(putArgStk->gtCall);
7626
7627#endif // UNIX_X86_ABI
7628}
7629
7630//---------------------------------------------------------------------
7631// genAlignStackBeforeCall: Align the stack if necessary before a call.
7632//
7633// Arguments:
7634// call - the call node.
7635//
7636void CodeGen::genAlignStackBeforeCall(GenTreeCall* call)
7637{
7638#if defined(UNIX_X86_ABI)
7639
7640 // Have we aligned the stack yet?
7641 if (!call->fgArgInfo->IsStkAlignmentDone())
7642 {
7643 // We haven't done any stack alignment yet for this call. We might need to create
7644 // an alignment adjustment, even if this function itself doesn't have any stack args.
7645 // This can happen if this function call is part of a nested call sequence, and the outer
7646 // call has already pushed some arguments.
7647
7648 unsigned stkLevel = genStackLevel + call->fgArgInfo->GetStkSizeBytes();
7649 call->fgArgInfo->ComputeStackAlignment(stkLevel);
7650
7651 unsigned padStkAlign = call->fgArgInfo->GetStkAlign();
7652 if (padStkAlign != 0)
7653 {
7654 // Now generate the alignment
7655 inst_RV_IV(INS_sub, REG_SPBASE, padStkAlign, EA_PTRSIZE);
7656 AddStackLevel(padStkAlign);
7657 AddNestedAlignment(padStkAlign);
7658 }
7659
7660 call->fgArgInfo->SetStkAlignmentDone();
7661 }
7662
7663#endif // UNIX_X86_ABI
7664}
7665
7666//---------------------------------------------------------------------
7667// genRemoveAlignmentAfterCall: After a call, remove the alignment
7668// added before the call, if any.
7669//
7670// Arguments:
7671// call - the call node.
7672// bias - additional stack adjustment
7673//
7674// Note:
7675// When bias > 0, caller should adjust stack level appropriately as
7676// bias is not considered when adjusting stack level.
7677//
7678void CodeGen::genRemoveAlignmentAfterCall(GenTreeCall* call, unsigned bias)
7679{
7680#if defined(_TARGET_X86_)
7681#if defined(UNIX_X86_ABI)
7682 // Put back the stack pointer if there was any padding for stack alignment
7683 unsigned padStkAlign = call->fgArgInfo->GetStkAlign();
7684 unsigned padStkAdjust = padStkAlign + bias;
7685
7686 if (padStkAdjust != 0)
7687 {
7688 inst_RV_IV(INS_add, REG_SPBASE, padStkAdjust, EA_PTRSIZE);
7689 SubtractStackLevel(padStkAlign);
7690 SubtractNestedAlignment(padStkAlign);
7691 }
7692#else // UNIX_X86_ABI
7693 if (bias != 0)
7694 {
7695 genAdjustSP(bias);
7696 }
#endif // !UNIX_X86_ABI
7698#else // _TARGET_X86_
7699 assert(bias == 0);
#endif // !_TARGET_X86_
7701}
7702
7703#ifdef _TARGET_X86_
7704
7705//---------------------------------------------------------------------
7706// genAdjustStackForPutArgStk:
7707// adjust the stack pointer for a putArgStk node if necessary.
7708//
7709// Arguments:
7710// putArgStk - the putArgStk node.
7711//
7712// Returns: true if the stack pointer was adjusted; false otherwise.
7713//
7714// Notes:
7715// Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
7716// false if the stack arg needs to be stored at the current stack
7717// pointer address. This is exactly the opposite of the return value
7718// of this function.
7719//
7720bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
7721{
7722 const unsigned argSize = putArgStk->getArgSize();
7723 GenTree* source = putArgStk->gtGetOp1();
7724
7725#ifdef FEATURE_SIMD
7726 if (!source->OperIs(GT_FIELD_LIST) && varTypeIsSIMD(source))
7727 {
7728 inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE);
7729 AddStackLevel(argSize);
7730 m_pushStkArg = false;
7731 return true;
7732 }
7733#endif // FEATURE_SIMD
7734
7735 // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack.
7736 // This is set in Lowering, and is true if and only if:
7737 // - This argument contains any GC pointers OR
7738 // - It is a GT_FIELD_LIST OR
7739 // - It is less than 16 bytes in size.
7740 CLANG_FORMAT_COMMENT_ANCHOR;
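
    // For example (illustrative): a 24-byte struct with no GC refs is stored after a single
    // "sub esp, 24" adjustment, while a 12-byte struct, or any struct containing GC refs, is
    // pushed slot by slot instead.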
7741
7742#ifdef DEBUG
7743 switch (putArgStk->gtPutArgStkKind)
7744 {
7745 case GenTreePutArgStk::Kind::RepInstr:
7746 case GenTreePutArgStk::Kind::Unroll:
7747 assert((putArgStk->gtNumberReferenceSlots == 0) && (source->OperGet() != GT_FIELD_LIST) && (argSize >= 16));
7748 break;
7749 case GenTreePutArgStk::Kind::Push:
7750 case GenTreePutArgStk::Kind::PushAllSlots:
7751 assert((putArgStk->gtNumberReferenceSlots != 0) || (source->OperGet() == GT_FIELD_LIST) || (argSize < 16));
7752 break;
7753 case GenTreePutArgStk::Kind::Invalid:
7754 default:
7755 assert(!"Uninitialized GenTreePutArgStk::Kind");
7756 break;
7757 }
7758#endif // DEBUG
7759
7760 if (putArgStk->isPushKind())
7761 {
7762 m_pushStkArg = true;
7763 return false;
7764 }
7765 else
7766 {
7767 m_pushStkArg = false;
7768 inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE);
7769 AddStackLevel(argSize);
7770 return true;
7771 }
7772}
7773
7774//---------------------------------------------------------------------
7775// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
7776//
7777// Arguments
7778// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
7779//
7780// Return value:
7781// None
7782//
7783void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
7784{
7785 GenTreeFieldList* const fieldList = putArgStk->gtOp1->AsFieldList();
7786 assert(fieldList != nullptr);
7787
7788 // Set m_pushStkArg and pre-adjust the stack if necessary.
7789 const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);
7790
7791 // For now, we only support the "push" case; we will push a full slot for the first field of each slot
7792 // within the struct.
7793 assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);
7794
7795 // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
7796 // (Note that this mode is not currently being used.)
7797 // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
7798 // in reverse order, so we start with the current field offset at the size of the struct arg (which must be
7799 // a multiple of the target pointer size).
7800 unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
7801 unsigned prevFieldOffset = currentOffset;
7802 regNumber intTmpReg = REG_NA;
7803 regNumber simdTmpReg = REG_NA;
7804 if (putArgStk->AvailableTempRegCount() != 0)
7805 {
7806 regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
7807 if ((rsvdRegs & RBM_ALLINT) != 0)
7808 {
7809 intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT);
7810 assert(genIsValidIntReg(intTmpReg));
7811 }
7812 if ((rsvdRegs & RBM_ALLFLOAT) != 0)
7813 {
7814 simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT);
7815 assert(genIsValidFloatReg(simdTmpReg));
7816 }
7817 assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
7818 }
7819
7820 for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
7821 {
7822 GenTree* const fieldNode = current->Current();
7823 const unsigned fieldOffset = current->gtFieldOffset;
7824 var_types fieldType = current->gtFieldType;
7825
7826 // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the
7827 // field list in descending order by offset.
7828 assert(!varTypeIsLong(fieldType));
7829 assert(fieldOffset <= prevFieldOffset);
7830
7831 // Consume the register, if any, for this field. Note that genConsumeRegs() will appropriately
7832 // update the liveness info for a lclVar that has been marked RegOptional, which hasn't been
7833 // assigned a register, and which is therefore contained.
7834 // Unlike genConsumeReg(), it handles the case where no registers are being consumed.
7835 genConsumeRegs(fieldNode);
7836 regNumber argReg = fieldNode->isUsedFromSpillTemp() ? REG_NA : fieldNode->gtRegNum;
7837
7838 // If the field is slot-like, we can use a push instruction to store the entire register no matter the type.
7839 //
7840 // The GC encoder requires that the stack remain 4-byte aligned at all times. Round the adjustment up
7841 // to the next multiple of 4. If we are going to generate a `push` instruction, the adjustment must
7842 // not require rounding.
7843 // NOTE: if the field is of GC type, we must use a push instruction, since the emitter is not otherwise
7844 // able to detect stores into the outgoing argument area of the stack on x86.
7845 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
7846 int adjustment = roundUp(currentOffset - fieldOffset, 4);
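        // Worked example (illustrative): for a 12-byte struct with int fields at offsets 8 and 0,
        // the field at offset 8 is slot-like (adjustment == push size, so it is simply pushed),
        // while the field at offset 0 then sees an 8-byte adjustment, producing one "push 0" of
        // padding followed by the push of the field itself.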
7847 if (fieldIsSlot && !varTypeIsSIMD(fieldType))
7848 {
7849 fieldType = genActualType(fieldType);
7850 unsigned pushSize = genTypeSize(fieldType);
7851 assert((pushSize % 4) == 0);
7852 adjustment -= pushSize;
7853 while (adjustment != 0)
7854 {
7855 inst_IV(INS_push, 0);
7856 currentOffset -= pushSize;
7857 AddStackLevel(pushSize);
7858 adjustment -= pushSize;
7859 }
7860 m_pushStkArg = true;
7861 }
7862 else
7863 {
7864 m_pushStkArg = false;
7865
7866 // We always "push" floating point fields (i.e. they are full slot values that don't
7867 // require special handling).
7868 assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));
7869
7870 // If we can't push this field, it needs to be in a register so that we can store
7871 // it to the stack location.
7872 if (adjustment != 0)
7873 {
7874 // This moves the stack pointer to fieldOffset.
7875 // For this case, we must adjust the stack and generate stack-relative stores rather than pushes.
7876 // Adjust the stack pointer to the next slot boundary.
7877 inst_RV_IV(INS_sub, REG_SPBASE, adjustment, EA_PTRSIZE);
7878 currentOffset -= adjustment;
7879 AddStackLevel(adjustment);
7880 }
7881
7882 // Does it need to be in a byte register?
7883 // If so, we'll use intTmpReg, which must have been allocated as a byte register.
7884 // If it's already in a register, but not a byteable one, then move it.
7885 if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
7886 {
7887 assert(intTmpReg != REG_NA);
7888 noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
7889 if (argReg != REG_NA)
7890 {
7891 inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
7892 argReg = intTmpReg;
7893 }
7894 }
7895 }
7896
7897 if (argReg == REG_NA)
7898 {
7899 if (m_pushStkArg)
7900 {
7901 if (fieldNode->isUsedFromSpillTemp())
7902 {
7903 assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
7904 assert(fieldNode->IsRegOptional());
7905 TempDsc* tmp = getSpillTempDsc(fieldNode);
7906 getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
7907 regSet.tmpRlsTemp(tmp);
7908 }
7909 else
7910 {
7911 assert(varTypeIsIntegralOrI(fieldNode));
7912 switch (fieldNode->OperGet())
7913 {
7914 case GT_LCL_VAR:
7915 inst_TT(INS_push, fieldNode, 0, 0, emitActualTypeSize(fieldNode->TypeGet()));
7916 break;
7917 case GT_CNS_INT:
7918 if (fieldNode->IsIconHandle())
7919 {
7920 inst_IV_handle(INS_push, fieldNode->gtIntCon.gtIconVal);
7921 }
7922 else
7923 {
7924 inst_IV(INS_push, fieldNode->gtIntCon.gtIconVal);
7925 }
7926 break;
7927 default:
7928 unreached();
7929 }
7930 }
7931 currentOffset -= TARGET_POINTER_SIZE;
7932 AddStackLevel(TARGET_POINTER_SIZE);
7933 }
7934 else
7935 {
7936 // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
7937 assert(varTypeIsIntegralOrI(fieldNode));
7938 switch (fieldNode->OperGet())
7939 {
7940 case GT_LCL_VAR:
7941 inst_RV_TT(INS_mov, intTmpReg, fieldNode);
7942 break;
7943 case GT_CNS_INT:
7944 genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
7945 break;
7946 default:
7947 unreached();
7948 }
7949 genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
7950 }
7951 }
7952 else
7953 {
7954#if defined(FEATURE_SIMD)
7955 if (fieldType == TYP_SIMD12)
7956 {
7957 assert(genIsValidFloatReg(simdTmpReg));
7958 genStoreSIMD12ToStack(argReg, simdTmpReg);
7959 }
7960 else
7961#endif // defined(FEATURE_SIMD)
7962 {
7963 genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
7964 }
7965 if (m_pushStkArg)
7966 {
7967 // We always push a slot-rounded size
7968 currentOffset -= genTypeSize(fieldType);
7969 }
7970 }
7971
7972 prevFieldOffset = fieldOffset;
7973 }
7974 if (currentOffset != 0)
7975 {
7976 // We don't expect padding at the beginning of a struct, but it could happen with explicit layout.
7977 inst_RV_IV(INS_sub, REG_SPBASE, currentOffset, EA_PTRSIZE);
7978 AddStackLevel(currentOffset);
7979 }
7980}
7981#endif // _TARGET_X86_
7982
7983//---------------------------------------------------------------------
7984// genPutArgStk - generate code for passing an arg on the stack.
7985//
7986// Arguments
7987// treeNode - the GT_PUTARG_STK node
7988// targetType - the type of the treeNode
7989//
7990// Return value:
7991// None
7992//
7993void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
7994{
7995 GenTree* data = putArgStk->gtOp1;
7996 var_types targetType = genActualType(data->TypeGet());
7997
7998#ifdef _TARGET_X86_
7999
8000 genAlignStackBeforeCall(putArgStk);
8001
8002 if ((data->OperGet() != GT_FIELD_LIST) && varTypeIsStruct(targetType))
8003 {
8004 (void)genAdjustStackForPutArgStk(putArgStk);
8005 genPutStructArgStk(putArgStk);
8006 return;
8007 }
8008
8009 // On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
8010 assert(targetType != TYP_LONG);
8011
8012 const unsigned argSize = putArgStk->getArgSize();
8013 assert((argSize % TARGET_POINTER_SIZE) == 0);
8014
8015 if (data->isContainedIntOrIImmed())
8016 {
8017 if (data->IsIconHandle())
8018 {
8019 inst_IV_handle(INS_push, data->gtIntCon.gtIconVal);
8020 }
8021 else
8022 {
8023 inst_IV(INS_push, data->gtIntCon.gtIconVal);
8024 }
8025 AddStackLevel(argSize);
8026 }
8027 else if (data->OperGet() == GT_FIELD_LIST)
8028 {
8029 genPutArgStkFieldList(putArgStk);
8030 }
8031 else
8032 {
8033 // We should not see any contained nodes that are not immediates.
8034 assert(data->isUsedFromReg());
8035 genConsumeReg(data);
8036 genPushReg(targetType, data->gtRegNum);
8037 }
8038#else // !_TARGET_X86_
8039 {
8040 unsigned baseVarNum = getBaseVarForPutArgStk(putArgStk);
8041
8042#ifdef UNIX_AMD64_ABI
8043
8044 if (data->OperIs(GT_FIELD_LIST))
8045 {
8046 genPutArgStkFieldList(putArgStk, baseVarNum);
8047 return;
8048 }
8049 else if (varTypeIsStruct(targetType))
8050 {
8051 m_stkArgVarNum = baseVarNum;
8052 m_stkArgOffset = putArgStk->getArgOffset();
8053 genPutStructArgStk(putArgStk);
8054 m_stkArgVarNum = BAD_VAR_NUM;
8055 return;
8056 }
8057#endif // UNIX_AMD64_ABI
8058
8059 noway_assert(targetType != TYP_STRUCT);
8060
8061 // Get argument offset on stack.
        // Here we cross-check that the argument offset hasn't changed from lowering to codegen,
        // since the arg slot number was stored in the GT_PUTARG_STK node during lowering.
8064 int argOffset = putArgStk->getArgOffset();
8065
8066#ifdef DEBUG
8067 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(putArgStk->gtCall, putArgStk);
8068 assert(curArgTabEntry);
8069 assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE);
8070#endif
8071
8072 if (data->isContainedIntOrIImmed())
8073 {
8074 getEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), baseVarNum, argOffset,
8075 (int)data->AsIntConCommon()->IconValue());
8076 }
8077 else
8078 {
8079 assert(data->isUsedFromReg());
8080 genConsumeReg(data);
8081 getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, baseVarNum,
8082 argOffset);
8083 }
8084 }
8085#endif // !_TARGET_X86_
8086}
8087
8088//---------------------------------------------------------------------
8089// genPutArgReg - generate code for a GT_PUTARG_REG node
8090//
8091// Arguments
8092// tree - the GT_PUTARG_REG node
8093//
8094// Return value:
8095// None
8096//
8097void CodeGen::genPutArgReg(GenTreeOp* tree)
8098{
8099 assert(tree->OperIs(GT_PUTARG_REG));
8100
8101 var_types targetType = tree->TypeGet();
8102 regNumber targetReg = tree->gtRegNum;
8103
8104#ifndef UNIX_AMD64_ABI
8105 assert(targetType != TYP_STRUCT);
8106#endif // !UNIX_AMD64_ABI
8107
8108 GenTree* op1 = tree->gtOp1;
8109 genConsumeReg(op1);
8110
8111 // If child node is not already in the register we need, move it
8112 if (targetReg != op1->gtRegNum)
8113 {
8114 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
8115 }
8116
8117 genProduceReg(tree);
8118}
8119
8120#ifdef _TARGET_X86_
8121// genPushReg: Push a register value onto the stack and adjust the stack level
8122//
8123// Arguments:
8124// type - the type of value to be stored
8125// reg - the register containing the value
8126//
8127// Notes:
8128// For TYP_LONG, the srcReg must be a floating point register.
8129// Otherwise, the register type must be consistent with the given type.
8130//
8131void CodeGen::genPushReg(var_types type, regNumber srcReg)
8132{
8133 unsigned size = genTypeSize(type);
8134 if (varTypeIsIntegralOrI(type) && type != TYP_LONG)
8135 {
8136 assert(genIsValidIntReg(srcReg));
8137 inst_RV(INS_push, srcReg, type);
8138 }
8139 else
8140 {
8141 instruction ins;
8142 emitAttr attr = emitTypeSize(type);
8143 if (type == TYP_LONG)
8144 {
8145 // On x86, the only way we can push a TYP_LONG from a register is if it is in an xmm reg.
8146 // This is only used when we are pushing a struct from memory to memory, and basically is
8147 // handling an 8-byte "chunk", as opposed to strictly a long type.
8148 ins = INS_movq;
8149 }
8150 else
8151 {
8152 ins = ins_Store(type);
8153 }
8154 assert(genIsValidFloatReg(srcReg));
8155 inst_RV_IV(INS_sub, REG_SPBASE, size, EA_PTRSIZE);
8156 getEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, 0);
8157 }
8158 AddStackLevel(size);
8159}
8160#endif // _TARGET_X86_
8161
8162#if defined(FEATURE_PUT_STRUCT_ARG_STK)
8163// genStoreRegToStackArg: Store a register value into the stack argument area
8164//
8165// Arguments:
8166// type - the type of value to be stored
8167// reg - the register containing the value
8168// offset - the offset from the base (see Assumptions below)
8169//
8170// Notes:
8171// A type of TYP_STRUCT instructs this method to store a 16-byte chunk
8172// at the given offset (i.e. not the full struct).
8173//
8174// Assumptions:
8175// The caller must set the context appropriately before calling this method:
8176// - On x64, m_stkArgVarNum must be set according to whether this is a regular or tail call.
8177// - On x86, the caller must set m_pushStkArg if this method should push the argument.
8178// Otherwise, the argument is stored at the given offset from sp.
8179//
8180// TODO: In the below code the load and store instructions are for 16 bytes, but the
8181// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
8182// this probably needs to be changed.
8183//
8184void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset)
8185{
8186 assert(srcReg != REG_NA);
8187 instruction ins;
8188 emitAttr attr;
8189 unsigned size;
8190
8191 if (type == TYP_STRUCT)
8192 {
8193 ins = INS_movdqu;
8194 // This should be changed!
8195 attr = EA_8BYTE;
8196 size = 16;
8197 }
8198 else
8199 {
8200#ifdef FEATURE_SIMD
8201 if (varTypeIsSIMD(type))
8202 {
8203 assert(genIsValidFloatReg(srcReg));
8204 ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly
8205 }
8206 else
8207#endif // FEATURE_SIMD
8208#ifdef _TARGET_X86_
8209 if (type == TYP_LONG)
8210 {
8211 assert(genIsValidFloatReg(srcReg));
8212 ins = INS_movq;
8213 }
8214 else
8215#endif // _TARGET_X86_
8216 {
8217 assert((varTypeIsFloating(type) && genIsValidFloatReg(srcReg)) ||
8218 (varTypeIsIntegralOrI(type) && genIsValidIntReg(srcReg)));
8219 ins = ins_Store(type);
8220 }
8221 attr = emitTypeSize(type);
8222 size = genTypeSize(type);
8223 }
8224
8225#ifdef _TARGET_X86_
8226 if (m_pushStkArg)
8227 {
8228 genPushReg(type, srcReg);
8229 }
8230 else
8231 {
8232 getEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset);
8233 }
8234#else // !_TARGET_X86_
8235 assert(m_stkArgVarNum != BAD_VAR_NUM);
8236 getEmitter()->emitIns_S_R(ins, attr, srcReg, m_stkArgVarNum, m_stkArgOffset + offset);
8237#endif // !_TARGET_X86_
8238}
8239
8240//---------------------------------------------------------------------
8241// genPutStructArgStk - generate code for copying a struct arg on the stack by value.
8242// In case there are references to heap object in the struct,
8243// it generates the gcinfo as well.
8244//
8245// Arguments
8246// putArgStk - the GT_PUTARG_STK node
8247//
8248// Notes:
8249// In the case of fixed out args, the caller must have set m_stkArgVarNum to the variable number
8250// corresponding to the argument area (where we will put the argument on the stack).
8251// For tail calls this is the baseVarNum = 0.
8252// For non tail calls this is the outgoingArgSpace.
8253void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
8254{
8255 GenTree* source = putArgStk->gtGetOp1();
8256 var_types targetType = source->TypeGet();
8257
8258#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
8259 if (putArgStk->isSIMD12())
8260 {
8261 genPutArgStkSIMD12(putArgStk);
8262 return;
8263 }
8264#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
8265
8266 if (varTypeIsSIMD(targetType))
8267 {
8268 regNumber srcReg = genConsumeReg(source);
8269 assert((srcReg != REG_NA) && (genIsValidFloatReg(srcReg)));
8270 genStoreRegToStackArg(targetType, srcReg, 0);
8271 return;
8272 }
8273
8274 assert(targetType == TYP_STRUCT);
8275
8276 if (putArgStk->gtNumberReferenceSlots == 0)
8277 {
8278 switch (putArgStk->gtPutArgStkKind)
8279 {
8280 case GenTreePutArgStk::Kind::RepInstr:
8281 genStructPutArgRepMovs(putArgStk);
8282 break;
8283 case GenTreePutArgStk::Kind::Unroll:
8284 genStructPutArgUnroll(putArgStk);
8285 break;
8286 case GenTreePutArgStk::Kind::Push:
8287 genStructPutArgUnroll(putArgStk);
8288 break;
8289 default:
8290 unreached();
8291 }
8292 }
8293 else
8294 {
        // No need to disable GC the way COPYOBJ does. Here the refs are always copied with
        // atomic operations.
8296 CLANG_FORMAT_COMMENT_ANCHOR;
8297
8298#ifdef _TARGET_X86_
        // On x86, any struct that contains GC references must be stored to the stack using `push` instructions,
        // so that the emitter properly detects the need to update the method's GC information.
8301 //
8302 // Strictly speaking, it is only necessary to use `push` to store the GC references themselves, so for structs
8303 // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
8304 // future.
8305 assert(m_pushStkArg);
8306
8307 GenTree* srcAddr = source->gtGetOp1();
8308 BYTE* gcPtrs = putArgStk->gtGcPtrs;
8309 const unsigned numSlots = putArgStk->gtNumSlots;
8310
8311 regNumber srcRegNum = srcAddr->gtRegNum;
8312 const bool srcAddrInReg = srcRegNum != REG_NA;
8313
8314 unsigned srcLclNum = 0;
8315 unsigned srcLclOffset = 0;
8316 if (srcAddrInReg)
8317 {
8318 genConsumeReg(srcAddr);
8319 }
8320 else
8321 {
8322 assert(srcAddr->OperIsLocalAddr());
8323
8324 srcLclNum = srcAddr->AsLclVarCommon()->gtLclNum;
8325 if (srcAddr->OperGet() == GT_LCL_FLD_ADDR)
8326 {
8327 srcLclOffset = srcAddr->AsLclFld()->gtLclOffs;
8328 }
8329 }
8330
8331 for (int i = numSlots - 1; i >= 0; --i)
8332 {
8333 emitAttr slotAttr;
8334 if (gcPtrs[i] == TYPE_GC_NONE)
8335 {
8336 slotAttr = EA_4BYTE;
8337 }
8338 else if (gcPtrs[i] == TYPE_GC_REF)
8339 {
8340 slotAttr = EA_GCREF;
8341 }
8342 else
8343 {
8344 assert(gcPtrs[i] == TYPE_GC_BYREF);
8345 slotAttr = EA_BYREF;
8346 }
8347
8348 const unsigned offset = i * TARGET_POINTER_SIZE;
8349 if (srcAddrInReg)
8350 {
8351 getEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, offset);
8352 }
8353 else
8354 {
8355 getEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + offset);
8356 }
8357 AddStackLevel(TARGET_POINTER_SIZE);
8358 }
8359#else // !defined(_TARGET_X86_)
8360
8361 // Consume these registers.
8362 // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
8363 genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA);
8364
8365 const bool srcIsLocal = putArgStk->gtOp1->AsObj()->gtOp1->OperIsLocalAddr();
8366 const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
8367
8368#if DEBUG
8369 unsigned numGCSlotsCopied = 0;
8370#endif // DEBUG
8371
8372 BYTE* gcPtrs = putArgStk->gtGcPtrs;
8373 const unsigned numSlots = putArgStk->gtNumSlots;
8374 for (unsigned i = 0; i < numSlots;)
8375 {
8376 if (gcPtrs[i] == TYPE_GC_NONE)
8377 {
8378 // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
8379 // instead of a sequence of movsp instructions to save cycles and code size.
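                //
                // For example (illustrative, and assuming CPOBJ_NONGC_SLOTS_LIMIT is 4): a run of
                // 6 non-GC slots emits "mov ecx, 6; rep movsq" on x64, while a run of 2 emits two
                // individual movsq instructions.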
8380 unsigned adjacentNonGCSlotCount = 0;
8381 do
8382 {
8383 adjacentNonGCSlotCount++;
8384 i++;
8385 } while ((i < numSlots) && (gcPtrs[i] == TYPE_GC_NONE));
8386
8387 // If we have a very small contiguous non-ref region, it's better just to
8388 // emit a sequence of movsp instructions
8389 if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
8390 {
8391 for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
8392 {
8393 instGen(INS_movsp);
8394 }
8395 }
8396 else
8397 {
8398 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
8399 instGen(INS_r_movsp);
8400 }
8401 }
8402 else
8403 {
8404 assert((gcPtrs[i] == TYPE_GC_REF) || (gcPtrs[i] == TYPE_GC_BYREF));
8405
8406 // We have a GC (byref or ref) pointer
                // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use the movsp
                // instruction, but the logic for emitting a GC info record is not available (it is internal
                // to the emitter). See the emitGCVarLiveUpd function. If we could call it separately, we
                // could emit instGen(INS_movsp) followed by the GC info update.
8411
8412 var_types memType = (gcPtrs[i] == TYPE_GC_REF) ? TYP_REF : TYP_BYREF;
8413 getEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
8414 genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
8415#ifdef DEBUG
8416 numGCSlotsCopied++;
8417#endif // DEBUG
8418
8419 i++;
8420 if (i < numSlots)
8421 {
8422 // Source for the copy operation.
8423 // If a LocalAddr, use EA_PTRSIZE - copy from stack.
8424 // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
8425 getEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
8426
8427 // Always copying to the stack - outgoing arg area
8428 // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
8429 getEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
8430 }
8431 }
8432 }
8433
8434 assert(numGCSlotsCopied == putArgStk->gtNumberReferenceSlots);
8435#endif // _TARGET_X86_
8436 }
8437}
8438#endif // defined(FEATURE_PUT_STRUCT_ARG_STK)
8439
8440/*****************************************************************************
8441 *
8442 * Create and record GC Info for the function.
8443 */
8444#ifndef JIT32_GCENCODER
8445void
8446#else // !JIT32_GCENCODER
8447void*
8448#endif // !JIT32_GCENCODER
8449CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr))
8450{
8451#ifdef JIT32_GCENCODER
8452 return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr));
8453#else // !JIT32_GCENCODER
8454 genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr));
8455#endif // !JIT32_GCENCODER
8456}
8457
8458#ifdef JIT32_GCENCODER
8459void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize,
8460 unsigned prologSize,
8461 unsigned epilogSize DEBUGARG(void* codePtr))
8462{
8463 BYTE headerBuf[64];
8464 InfoHdr header;
8465
8466 int s_cached;
8467
8468#ifdef WIN64EXCEPTIONS
8469 // We should do this before gcInfoBlockHdrSave since varPtrTableSize must be finalized before it
8470 if (compiler->ehAnyFunclets())
8471 {
8472 gcInfo.gcMarkFilterVarsPinned();
8473 }
8474#endif
8475
8476#ifdef DEBUG
8477 size_t headerSize =
8478#endif
8479 compiler->compInfoBlkSize =
8480 gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached);
8481
8482 size_t argTabOffset = 0;
8483 size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset);
8484
8485#if DISPLAY_SIZES
8486
8487 if (genInterruptible)
8488 {
8489 gcHeaderISize += compiler->compInfoBlkSize;
8490 gcPtrMapISize += ptrMapSize;
8491 }
8492 else
8493 {
8494 gcHeaderNSize += compiler->compInfoBlkSize;
8495 gcPtrMapNSize += ptrMapSize;
8496 }
8497
8498#endif // DISPLAY_SIZES
8499
8500 compiler->compInfoBlkSize += ptrMapSize;
8501
8502 /* Allocate the info block for the method */
8503
8504 compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize);
8505
8506#if 0 // VERBOSE_SIZES
8507 // TODO-X86-Cleanup: 'dataSize', below, is not defined
8508
8509// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100)
8510 {
8511 printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n",
8512 compiler->info.compILCodeSize,
8513 compiler->compInfoBlkSize,
8514 codeSize + dataSize,
8515 codeSize + dataSize - prologSize - epilogSize,
8516 100 * (codeSize + dataSize) / compiler->info.compILCodeSize,
8517 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize,
8518 compiler->info.compClassName,
8519 compiler->info.compMethodName);
8520    }
8521
8522#endif
8523
8524 /* Fill in the info block and return it to the caller */
8525
8526 void* infoPtr = compiler->compInfoBlkAddr;
8527
8528 /* Create the method info block: header followed by GC tracking tables */
8529
8530 compiler->compInfoBlkAddr +=
8531 gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached);
8532
8533 assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize);
8534 compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset);
8535 assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize);
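    // At this point the info block has the following layout (sizes are whatever the encoders
    // produced; this is just a sketch of the invariants asserted above and below):
    //    infoPtr + 0                       : encoded InfoHdr        (headerSize bytes)
    //    infoPtr + headerSize              : GC pointer tables      (ptrMapSize bytes)
    //    infoPtr + headerSize + ptrMapSize : end == infoPtr + compInfoBlkSize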
8536
8537#ifdef DEBUG
8538
8539 if (0)
8540 {
8541 BYTE* temp = (BYTE*)infoPtr;
8542 unsigned size = compiler->compInfoBlkAddr - temp;
8543 BYTE* ptab = temp + headerSize;
8544
8545 noway_assert(size == headerSize + ptrMapSize);
8546
8547        printf("Method info block - header [%u bytes]:", (unsigned)headerSize);
8548
8549 for (unsigned i = 0; i < size; i++)
8550 {
8551 if (temp == ptab)
8552 {
8553                printf("\nMethod info block - ptrtab [%u bytes]:", (unsigned)ptrMapSize);
8554 printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' ');
8555 }
8556 else
8557 {
8558 if (!(i % 16))
8559 printf("\n %04X: ", i);
8560 }
8561
8562 printf("%02X ", *temp++);
8563 }
8564
8565 printf("\n");
8566 }
8567
8568#endif // DEBUG
8569
8570#if DUMP_GC_TABLES
8571
8572 if (compiler->opts.dspGCtbls)
8573 {
8574 const BYTE* base = (BYTE*)infoPtr;
8575 unsigned size;
8576 unsigned methodSize;
8577 InfoHdr dumpHeader;
8578
8579 printf("GC Info for method %s\n", compiler->info.compFullName);
8580 printf("GC info size = %3u\n", compiler->compInfoBlkSize);
8581
8582 size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize);
8583 // printf("size of header encoding is %3u\n", size);
8584 printf("\n");
8585
8586 if (compiler->opts.dspGCtbls)
8587 {
8588 base += size;
8589 size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize);
8590 // printf("size of pointer table is %3u\n", size);
8591 printf("\n");
8592 noway_assert(compiler->compInfoBlkAddr == (base + size));
8593 }
8594 }
8595
8596#ifdef DEBUG
8597 if (jitOpts.testMask & 128)
8598 {
8599 for (unsigned offs = 0; offs < codeSize; offs++)
8600 {
8601 gcInfo.gcFindPtrsInFrame(infoPtr, codePtr, offs);
8602 }
8603 }
8604#endif // DEBUG
8605#endif // DUMP_GC_TABLES
8606
8607 /* Make sure we ended up generating the expected number of bytes */
8608
8609 noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize);
8610
8611 return infoPtr;
8612}
8613
8614#else // !JIT32_GCENCODER
8615void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr))
8616{
8617 IAllocator* allowZeroAlloc = new (compiler, CMK_GC) CompIAllocator(compiler->getAllocatorGC());
8618 GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC)
8619 GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM);
8620 assert(gcInfoEncoder);
8621
8622 // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32).
8623 gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize);
8624
8625 // We keep the call count for the second call to gcMakeRegPtrTable() below.
8626 unsigned callCnt = 0;
8627 // First we figure out the encoder ID's for the stack slots and registers.
8628 gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS, &callCnt);
8629 // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them).
8630 gcInfoEncoder->FinalizeSlotIds();
8631 // Now we can actually use those slot ID's to declare live ranges.
8632 gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK, &callCnt);
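    // In effect the encoder is driven with an assign-then-report protocol (sketch):
    //    pass 1 (MAKE_REG_PTR_MODE_ASSIGN_SLOTS) : request a slot id for every tracked register / stack slot
    //    FinalizeSlotIds()                       : freeze and compact the slot id set
    //    pass 2 (MAKE_REG_PTR_MODE_DO_WORK)      : report live ranges against those frozen ids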
8633
8634 if (compiler->opts.compDbgEnC)
8635 {
8636 // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp)
8637 // which is:
8638 // -return address
8639 // -saved off RBP
8640 // -saved 'this' pointer and bool for synchronized methods
8641
8642 // 4 slots for RBP + return address + RSI + RDI
8643 int preservedAreaSize = 4 * REGSIZE_BYTES;
8644
8645 if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
8646 {
8647 if (!(compiler->info.compFlags & CORINFO_FLG_STATIC))
8648 {
8649 preservedAreaSize += REGSIZE_BYTES;
8650 }
8651
8652 // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack)
8653 preservedAreaSize += 4;
8654 }
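        // Illustrative layout of the preserved frame header computed above (the exact stack ordering
        // is owned by the VM; see VM\eetwain.cpp) for a synchronized instance method:
        //    return address    REGSIZE_BYTES
        //    saved RBP         REGSIZE_BYTES
        //    saved RSI         REGSIZE_BYTES
        //    saved RDI         REGSIZE_BYTES
        //    saved 'this'      REGSIZE_BYTES  (synchronized, non-static only)
        //    lock-taken bool   4 bytes        (synchronized only)
        // i.e. preservedAreaSize = 4 * REGSIZE_BYTES + REGSIZE_BYTES + 4 in this example.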
8655
8656 // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the
8657 // frame
8658 gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize);
8659 }
8660
8661 if (compiler->opts.IsReversePInvoke())
8662 {
8663 unsigned reversePInvokeFrameVarNumber = compiler->lvaReversePInvokeFrameVar;
8664 assert(reversePInvokeFrameVarNumber != BAD_VAR_NUM && reversePInvokeFrameVarNumber < compiler->lvaRefCount);
8665 LclVarDsc& reversePInvokeFrameVar = compiler->lvaTable[reversePInvokeFrameVarNumber];
8666 gcInfoEncoder->SetReversePInvokeFrameSlot(reversePInvokeFrameVar.lvStkOffs);
8667 }
8668
8669 gcInfoEncoder->Build();
8670
8671 // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t)
8672 // let's save the values anyway for debugging purposes
8673 compiler->compInfoBlkAddr = gcInfoEncoder->Emit();
8674 compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface
8675}
8676#endif // !JIT32_GCENCODER
8677
8678/*****************************************************************************
8679 * Emit a call to a helper function.
8680 *
8681 */
8682
8683void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg)
8684{
8685 void* addr = nullptr;
8686 void* pAddr = nullptr;
8687
8688 emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN;
8689 addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr);
8690 regNumber callTarget = REG_NA;
8691 regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
8692
8693 if (!addr)
8694 {
8695 assert(pAddr != nullptr);
8696
8697        // Absolute indirect call addr
8698        // Note: The order of checks is important: always check for pc-relative first and then for
8699        // zero-relative, because the former encoding is one byte smaller than the latter.
8700 if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) ||
8701 genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr))
8702 {
8703 // generate call whose target is specified by 32-bit offset relative to PC or zero.
8704 callType = emitter::EC_FUNC_TOKEN_INDIR;
8705 addr = pAddr;
8706 }
8707 else
8708 {
8709#ifdef _TARGET_AMD64_
8710 // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero,
8711 // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to
8712 // make the call.
8713 // mov reg, addr
8714 // call [reg]
8715
8716 if (callTargetReg == REG_NA)
8717 {
8718                // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET,
8719                // but this is only valid if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
8720 callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
8721 regMaskTP callTargetMask = genRegMask(callTargetReg);
8722 noway_assert((callTargetMask & killMask) == callTargetMask);
8723 }
8724 else
8725 {
8726                // The call target must not overwrite any live variable, even though it may not be in the
8727                // kill set for the call.
8728 regMaskTP callTargetMask = genRegMask(callTargetReg);
8729 noway_assert((callTargetMask & regSet.rsMaskVars) == RBM_NONE);
8730 }
8731#endif
8732
8733 callTarget = callTargetReg;
8734 CodeGen::genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL);
8735 callType = emitter::EC_INDIR_ARD;
8736 }
8737 }
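    // To summarize, the call emitted below takes one of three shapes (illustrative sketch):
    //    addr available:                        call helperAddr            (EC_FUNC_TOKEN)
    //    pAddr encodable as rel32 / zero-rel:   call [pAddr]               (EC_FUNC_TOKEN_INDIR)
    //    otherwise (AMD64 only):                mov  callTargetReg, pAddr
    //                                           call [callTargetReg]       (EC_INDIR_ARD)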
8738
8739 // clang-format off
8740 getEmitter()->emitIns_Call(callType,
8741 compiler->eeFindHelper(helper),
8742 INDEBUG_LDISASM_COMMA(nullptr) addr,
8743 argSize,
8744 retSize
8745 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(EA_UNKNOWN),
8746 gcInfo.gcVarPtrSetCur,
8747 gcInfo.gcRegGCrefSetCur,
8748 gcInfo.gcRegByrefSetCur,
8749 BAD_IL_OFFSET, // IL offset
8750 callTarget, // ireg
8751 REG_NA, 0, 0, // xreg, xmul, disp
8752 false // isJump
8753 );
8754 // clang-format on
8755
8756 regSet.verifyRegistersUsed(killMask);
8757}
8758
8759/*****************************************************************************
8760* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog
8761* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late
8762* disassembler thinks the instructions are the same as we do.
8763*/
8764
8765// Uncomment "#define ALL_XARCH_EMITTER_UNIT_TESTS" to run all the unit tests here.
8766// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time.
8767//#define ALL_XARCH_EMITTER_UNIT_TESTS
8768
8769#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
8770void CodeGen::genAmd64EmitterUnitTests()
8771{
8772 if (!verbose)
8773 {
8774 return;
8775 }
8776
8777 if (!compiler->opts.altJit)
8778 {
8779 // No point doing this in a "real" JIT.
8780 return;
8781 }
8782
8783 // Mark the "fake" instructions in the output.
8784 printf("*************** In genAmd64EmitterUnitTests()\n");
8785
8786 // We use this:
8787 // genDefineTempLabel(genCreateTempLabel());
8788 // to create artificial labels to help separate groups of tests.
8789
8790 //
8791 // Loads
8792 //
8793 CLANG_FORMAT_COMMENT_ANCHOR;
8794
8795#ifdef ALL_XARCH_EMITTER_UNIT_TESTS
8796 genDefineTempLabel(genCreateTempLabel());
8797
8798 // vhaddpd ymm0,ymm1,ymm2
8799 getEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8800 // vaddss xmm0,xmm1,xmm2
8801 getEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8802 // vaddsd xmm0,xmm1,xmm2
8803 getEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8804 // vaddps xmm0,xmm1,xmm2
8805 getEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8806 // vaddps ymm0,ymm1,ymm2
8807 getEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8808 // vaddpd xmm0,xmm1,xmm2
8809 getEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8810 // vaddpd ymm0,ymm1,ymm2
8811 getEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8812 // vsubss xmm0,xmm1,xmm2
8813 getEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8814 // vsubsd xmm0,xmm1,xmm2
8815 getEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8816    // vsubps xmm0,xmm1,xmm2
8817 getEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8818 // vsubps ymm0,ymm1,ymm2
8819 getEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8820 // vsubpd xmm0,xmm1,xmm2
8821 getEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8822 // vsubpd ymm0,ymm1,ymm2
8823 getEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8824 // vmulss xmm0,xmm1,xmm2
8825 getEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8826 // vmulsd xmm0,xmm1,xmm2
8827 getEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8828 // vmulps xmm0,xmm1,xmm2
8829 getEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8830 // vmulpd xmm0,xmm1,xmm2
8831 getEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8832 // vmulps ymm0,ymm1,ymm2
8833 getEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8834 // vmulpd ymm0,ymm1,ymm2
8835 getEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8836 // vandps xmm0,xmm1,xmm2
8837 getEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8838 // vandpd xmm0,xmm1,xmm2
8839 getEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8840 // vandps ymm0,ymm1,ymm2
8841 getEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8842 // vandpd ymm0,ymm1,ymm2
8843 getEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8844 // vorps xmm0,xmm1,xmm2
8845 getEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8846 // vorpd xmm0,xmm1,xmm2
8847 getEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8848 // vorps ymm0,ymm1,ymm2
8849 getEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8850 // vorpd ymm0,ymm1,ymm2
8851 getEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8852 // vdivss xmm0,xmm1,xmm2
8853 getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8854 // vdivsd xmm0,xmm1,xmm2
8855 getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8856 // vdivss xmm0,xmm1,xmm2
8857 getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8858 // vdivsd xmm0,xmm1,xmm2
8859 getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8860
8861    // vcvtss2sd xmm0,xmm1,xmm2
8862 getEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8863    // vcvtsd2ss xmm0,xmm1,xmm2
8864 getEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8865#endif // ALL_XARCH_EMITTER_UNIT_TESTS
8866 printf("*************** End of genAmd64EmitterUnitTests()\n");
8867}
8868
8869#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
8870
8871#endif // _TARGET_XARCH_
8872