// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                                 Lower                                     XX
XX                                                                           XX
XX  Preconditions:                                                           XX
XX                                                                           XX
XX  Postconditions (for the nodes currently handled):                        XX
XX    - All operands requiring a register are explicit in the graph          XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#include "lower.h"

#if !defined(_TARGET_64BIT_)
#include "decomposelongs.h"
#endif // !defined(_TARGET_64BIT_)
29
//------------------------------------------------------------------------
// MakeSrcContained: Make "childNode" a contained node
//
// Arguments:
//    parentNode - is a non-leaf node that can contain its 'childNode'
//    childNode  - is an op that will now be contained by its parent.
//
// Notes:
//    If 'childNode' has any existing sources, they will now be sources for the parent.
//
void Lowering::MakeSrcContained(GenTree* parentNode, GenTree* childNode)
{
    assert(!parentNode->OperIsLeaf());
    assert(childNode->canBeContained());
    childNode->SetContained();
    assert(childNode->isContained());
}
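
// For illustration only (a simplified sketch, not an exact IR dump): containing a memory operand
// lets codegen fold the load into its consumer instead of producing the value into a register first.
//
//     t1 = GT_IND(addr)        ; contained under the add
//     t2 = GT_ADD(x, t1)       ; emitted on xarch as, e.g.:  add reg_x, [addr]
//
// rather than loading [addr] into a register of its own.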
47
//------------------------------------------------------------------------
// CheckImmedAndMakeContained: Checks if the 'childNode' is a containable immediate
//    and, if so, makes it contained.
//
// Arguments:
//    parentNode - is any non-leaf node
//    childNode  - is a child op of 'parentNode'
//
// Return value:
//     true if we are able to make childNode a contained immediate
//
bool Lowering::CheckImmedAndMakeContained(GenTree* parentNode, GenTree* childNode)
{
    assert(!parentNode->OperIsLeaf());
    // If childNode is a containable immediate
    if (IsContainableImmed(parentNode, childNode))
    {
        // then make it contained within the parentNode
        MakeSrcContained(parentNode, childNode);
        return true;
    }
    return false;
}
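
// For illustration only (a simplified sketch): an immediate that fits the instruction encoding,
// e.g. the constant in GT_ADD(x, GT_CNS_INT 8) on xarch, can be contained so that codegen emits
// "add reg, 8" instead of materializing the constant in a register first.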
71
//------------------------------------------------------------------------
// IsSafeToContainMem: Checks for conflicts between childNode and parentNode,
// and returns 'true' iff memory operand childNode can be contained in parentNode.
//
// Arguments:
//    parentNode - any non-leaf node
//    childNode  - some node that is an input to `parentNode`
//
// Return value:
//    true if it is safe to make childNode a contained memory operand.
//
bool Lowering::IsSafeToContainMem(GenTree* parentNode, GenTree* childNode)
{
    m_scratchSideEffects.Clear();
    m_scratchSideEffects.AddNode(comp, childNode);

    for (GenTree* node = childNode->gtNext; node != parentNode; node = node->gtNext)
    {
        if (m_scratchSideEffects.InterferesWith(comp, node, false))
        {
            return false;
        }
    }

    return true;
}
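
// For illustration only (a simplified sketch, not an exact IR dump): containment is unsafe when a
// node between the operand and its user interferes with the operand's side effects or sources.
//
//     t1 = GT_IND(addr)             ; the candidate memory operand
//     GT_STOREIND(otherAddr, ...)   ; may alias 'addr', so it interferes
//     t2 = GT_ADD(x, t1)            ; t1 cannot be contained here
//
// Re-evaluating the load at the add could observe the intervening store, so IsSafeToContainMem
// returns false in such cases.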
98
99//------------------------------------------------------------------------
100
101// This is the main entry point for Lowering.
102GenTree* Lowering::LowerNode(GenTree* node)
103{
104 assert(node != nullptr);
105 switch (node->gtOper)
106 {
107 case GT_IND:
108 TryCreateAddrMode(LIR::Use(BlockRange(), &node->gtOp.gtOp1, node), true);
109 ContainCheckIndir(node->AsIndir());
110 break;
111
112 case GT_STOREIND:
113 TryCreateAddrMode(LIR::Use(BlockRange(), &node->gtOp.gtOp1, node), true);
114 if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(node))
115 {
116 LowerStoreIndir(node->AsIndir());
117 }
118 break;
119
120 case GT_ADD:
121 {
122 GenTree* afterTransform = LowerAdd(node);
123 if (afterTransform != nullptr)
124 {
125 return afterTransform;
126 }
127 __fallthrough;
128 }
129
130#if !defined(_TARGET_64BIT_)
131 case GT_ADD_LO:
132 case GT_ADD_HI:
133 case GT_SUB_LO:
134 case GT_SUB_HI:
135#endif
136 case GT_SUB:
137 case GT_AND:
138 case GT_OR:
139 case GT_XOR:
140 ContainCheckBinary(node->AsOp());
141 break;
142
143 case GT_MUL:
144 case GT_MULHI:
145#if defined(_TARGET_X86_)
146 case GT_MUL_LONG:
147#endif
148 ContainCheckMul(node->AsOp());
149 break;
150
151 case GT_UDIV:
152 case GT_UMOD:
153 if (!LowerUnsignedDivOrMod(node->AsOp()))
154 {
155 ContainCheckDivOrMod(node->AsOp());
156 }
157 break;
158
159 case GT_DIV:
160 case GT_MOD:
161 return LowerSignedDivOrMod(node);
162
163 case GT_SWITCH:
164 return LowerSwitch(node);
165
166 case GT_CALL:
167 LowerCall(node);
168 break;
169
170 case GT_LT:
171 case GT_LE:
172 case GT_GT:
173 case GT_GE:
174 case GT_EQ:
175 case GT_NE:
176 case GT_TEST_EQ:
177 case GT_TEST_NE:
178 case GT_CMP:
179 return LowerCompare(node);
180
181 case GT_JTRUE:
182 return LowerJTrue(node->AsOp());
183
184 case GT_JMP:
185 LowerJmpMethod(node);
186 break;
187
188 case GT_RETURN:
189 LowerRet(node);
190 break;
191
192 case GT_RETURNTRAP:
193 ContainCheckReturnTrap(node->AsOp());
194 break;
195
196 case GT_CAST:
197 LowerCast(node);
198 break;
199
200#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
201 case GT_ARR_BOUNDS_CHECK:
202#ifdef FEATURE_SIMD
203 case GT_SIMD_CHK:
204#endif // FEATURE_SIMD
205#ifdef FEATURE_HW_INTRINSICS
206 case GT_HW_INTRINSIC_CHK:
207#endif // FEATURE_HW_INTRINSICS
208 ContainCheckBoundsChk(node->AsBoundsChk());
209 break;
#endif // _TARGET_XARCH_ || _TARGET_ARM64_
211 case GT_ARR_ELEM:
212 return LowerArrElem(node);
213
214 case GT_ARR_OFFSET:
215 ContainCheckArrOffset(node->AsArrOffs());
216 break;
217
218 case GT_ROL:
219 case GT_ROR:
220 LowerRotate(node);
221 break;
222
223#ifndef _TARGET_64BIT_
224 case GT_LSH_HI:
225 case GT_RSH_LO:
226 ContainCheckShiftRotate(node->AsOp());
227 break;
228#endif // !_TARGET_64BIT_
229
230 case GT_LSH:
231 case GT_RSH:
232 case GT_RSZ:
233#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
234 LowerShift(node->AsOp());
235#else
236 ContainCheckShiftRotate(node->AsOp());
237#endif
238 break;
239
240 case GT_STORE_BLK:
241 case GT_STORE_OBJ:
242 case GT_STORE_DYN_BLK:
243 {
244 GenTreeBlk* blkNode = node->AsBlk();
245 TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
246 LowerBlockStore(blkNode);
247 }
248 break;
249
250 case GT_LCLHEAP:
251 ContainCheckLclHeap(node->AsOp());
252 break;
253
254#ifdef _TARGET_XARCH_
255 case GT_INTRINSIC:
256 ContainCheckIntrinsic(node->AsOp());
257 break;
258#endif // _TARGET_XARCH_
259
260#ifdef FEATURE_SIMD
261 case GT_SIMD:
262 LowerSIMD(node->AsSIMD());
263 break;
264#endif // FEATURE_SIMD
265
266#ifdef FEATURE_HW_INTRINSICS
267 case GT_HWIntrinsic:
268 LowerHWIntrinsic(node->AsHWIntrinsic());
269 break;
270#endif // FEATURE_HW_INTRINSICS
271
272 case GT_LCL_FLD:
273 {
274 // We should only encounter this for lclVars that are lvDoNotEnregister.
275 verifyLclFldDoNotEnregister(node->AsLclVarCommon()->gtLclNum);
276 break;
277 }
278
279 case GT_LCL_VAR:
280 WidenSIMD12IfNecessary(node->AsLclVarCommon());
281 break;
282
283 case GT_STORE_LCL_VAR:
284 WidenSIMD12IfNecessary(node->AsLclVarCommon());
285 __fallthrough;
286
287 case GT_STORE_LCL_FLD:
288 {
289#if defined(_TARGET_AMD64_) && defined(FEATURE_SIMD)
290 GenTreeLclVarCommon* const store = node->AsLclVarCommon();
291 if ((store->TypeGet() == TYP_SIMD8) != (store->gtOp1->TypeGet() == TYP_SIMD8))
292 {
293 GenTreeUnOp* bitcast =
294 new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, store->TypeGet(), store->gtOp1, nullptr);
295 store->gtOp1 = bitcast;
296 BlockRange().InsertBefore(store, bitcast);
297 }
#endif // _TARGET_AMD64_ && FEATURE_SIMD
299 // TODO-1stClassStructs: Once we remove the requirement that all struct stores
300 // are block stores (GT_STORE_BLK or GT_STORE_OBJ), here is where we would put the local
301 // store under a block store if codegen will require it.
302 if ((node->TypeGet() == TYP_STRUCT) && (node->gtGetOp1()->OperGet() != GT_PHI))
303 {
304#if FEATURE_MULTIREG_RET
305 GenTree* src = node->gtGetOp1();
306 assert((src->OperGet() == GT_CALL) && src->AsCall()->HasMultiRegRetVal());
307#else // !FEATURE_MULTIREG_RET
308 assert(!"Unexpected struct local store in Lowering");
309#endif // !FEATURE_MULTIREG_RET
310 }
311 LowerStoreLoc(node->AsLclVarCommon());
312 break;
313 }
314
315#if defined(_TARGET_ARM64_)
316 case GT_CMPXCHG:
317 CheckImmedAndMakeContained(node, node->AsCmpXchg()->gtOpComparand);
318 break;
319
320 case GT_XADD:
321 CheckImmedAndMakeContained(node, node->gtOp.gtOp2);
322 break;
323#elif defined(_TARGET_XARCH_)
324 case GT_XADD:
325 if (node->IsUnusedValue())
326 {
327 node->ClearUnusedValue();
328 // Make sure the types are identical, since the node type is changed to VOID
329 // CodeGen relies on op2's type to determine the instruction size.
330 // Note that the node type cannot be a small int but the data operand can.
331 assert(genActualType(node->gtGetOp2()->TypeGet()) == node->TypeGet());
332 node->SetOper(GT_LOCKADD);
333 node->gtType = TYP_VOID;
334 CheckImmedAndMakeContained(node, node->gtGetOp2());
335 }
336 break;
337#endif
338
339#ifndef _TARGET_ARMARCH_
340 // TODO-ARMARCH-CQ: We should contain this as long as the offset fits.
341 case GT_OBJ:
342 if (node->AsObj()->Addr()->OperIsLocalAddr())
343 {
344 node->AsObj()->Addr()->SetContained();
345 }
346 break;
347#endif // !_TARGET_ARMARCH_
348
349 default:
350 break;
351 }
352
353 return node->gtNext;
354}
355
/** -- Switch Lowering --
 * The main idea of switch lowering is to make the register requirements of this node transparent
 * to LSRA downstream. The switch instruction is inherently a control statement that the JIT
 * represents as a simple tree node, but at the time we actually generate code for it we end up
 * emitting instructions that modify the flow of execution and that impose complicated
 * register requirements and lifetimes.
 *
 * So, for the purposes of LSRA, we want a more detailed specification of what a switch node actually
 * means and, more importantly, which registers we need, and when, for each instruction we want to issue,
 * so that they can be allocated correctly downstream.
 *
 * To that end, this procedure performs switch lowering in one of two ways:
 *
 * a) Represent the switch statement as a zero-based jump table construct. This means that for every destination
 *    of the switch, we will store this destination in an array of addresses and the code generator will issue
 *    a data section where this array will live and will emit code that, based on the switch index, will indirect and
 *    jump to the destination specified in the jump table.
 *
 *    For this transformation we introduce a new GT node called GT_SWITCH_TABLE that is a specialization of the switch
 *    node for jump table based switches.
 *    The overall structure of a GT_SWITCH_TABLE is:
 *
 *    GT_SWITCH_TABLE
 *        |_________ localVar   (a temporary local that holds the switch index)
 *        |_________ jumpTable  (this is a special node that holds the address of the jump table array)
 *
 *    Now, the way we morph a GT_SWITCH node into this lowered switch table node form is the following:
 *
 *    Input:     GT_SWITCH (inside a basic block whose Branch Type is BBJ_SWITCH)
 *                   |_____ expr (an arbitrarily complex GT_NODE that represents the switch index)
 *
 *    This gets transformed into the following statements inside a BBJ_COND basic block (the target would be
 *    the default case of the switch in case the conditional is evaluated to true).
 *
 *    ----- original block, transformed
 *    GT_STORE_LCL_VAR tempLocal (a new temporary local variable used to store the switch index)
 *       |_____ expr      (the index expression)
 *
 *    GT_JTRUE
 *       |_____ GT_COND
 *                |_____ GT_GE
 *                          |___ Int_Constant  (This constant is the index of the default case
 *                                              that happens to be the highest index in the jump table).
 *                          |___ tempLocal     (The local variable where we stored the index expression).
 *
 *    ----- new basic block
 *    GT_SWITCH_TABLE
 *       |_____ tempLocal
 *       |_____ jumpTable (a new jump table node that now LSRA can allocate registers for explicitly
 *                         and LinearCodeGen will be responsible to generate downstream).
 *
 *    This way there are no implicit temporaries.
 *
 * b) For small-sized switches, we will actually morph them into a series of conditionals of the form
 *     if (case falls into the default){ goto jumpTable[size]; // last entry in the jump table is the default case }
 *     (For the default case conditional, we'll be constructing the exact same code as the jump table case one).
 *     else if (case == firstCase){ goto jumpTable[1]; }
 *     else if (case == secondCase) { goto jumptable[2]; } and so on.
 *
 *    This transformation is done in JIT-IR, not downstream at the CodeGen level, so we no longer
 *    require internal temporaries to maintain the index we're evaluating, and we reuse existing code from
 *    LinearCodeGen instead of implementing all of the control flow constructs using InstrDscs and
 *    InstrGroups downstream.
 */
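
/* For illustration only (a simplified sketch, not exact JIT IR or real block names), a source-level
 * switch such as
 *
 *     switch (i)
 *     {
 *         case 0: ...; break;
 *         case 1: ...; break;
 *         case 2: ...; break;
 *         default: ...; break;
 *     }
 *
 * conceptually becomes, after the shared default check described above, either
 *
 *     if ((unsigned)i > 2) goto DEFAULT;   // BBJ_COND guarding the default case
 *     goto jumpTable[i];                   // GT_SWITCH_TABLE in its own block
 *
 * or, for small switches, a compare/branch chain:
 *
 *     if (i == 0) goto CASE0;
 *     if (i == 1) goto CASE1;
 *     goto CASE2;                          // the last case needs no compare
 */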
420
421GenTree* Lowering::LowerSwitch(GenTree* node)
422{
423 unsigned jumpCnt;
424 unsigned targetCnt;
425 BasicBlock** jumpTab;
426
427 assert(node->gtOper == GT_SWITCH);
428
429 // The first step is to build the default case conditional construct that is
430 // shared between both kinds of expansion of the switch node.
431
432 // To avoid confusion, we'll alias m_block to originalSwitchBB
433 // that represents the node we're morphing.
434 BasicBlock* originalSwitchBB = m_block;
435 LIR::Range& switchBBRange = LIR::AsRange(originalSwitchBB);
436
437 // jumpCnt is the number of elements in the jump table array.
438 // jumpTab is the actual pointer to the jump table array.
439 // targetCnt is the number of unique targets in the jump table array.
440 jumpCnt = originalSwitchBB->bbJumpSwt->bbsCount;
441 jumpTab = originalSwitchBB->bbJumpSwt->bbsDstTab;
442 targetCnt = originalSwitchBB->NumSucc(comp);
443
444// GT_SWITCH must be a top-level node with no use.
445#ifdef DEBUG
446 {
447 LIR::Use use;
448 assert(!switchBBRange.TryGetUse(node, &use));
449 }
450#endif
451
452 JITDUMP("Lowering switch " FMT_BB ", %d cases\n", originalSwitchBB->bbNum, jumpCnt);
453
454 // Handle a degenerate case: if the switch has only a default case, just convert it
455 // to an unconditional branch. This should only happen in minopts or with debuggable
456 // code.
457 if (targetCnt == 1)
458 {
459 JITDUMP("Lowering switch " FMT_BB ": single target; converting to BBJ_ALWAYS\n", originalSwitchBB->bbNum);
460 noway_assert(comp->opts.OptimizationDisabled());
461 if (originalSwitchBB->bbNext == jumpTab[0])
462 {
463 originalSwitchBB->bbJumpKind = BBJ_NONE;
464 originalSwitchBB->bbJumpDest = nullptr;
465 }
466 else
467 {
468 originalSwitchBB->bbJumpKind = BBJ_ALWAYS;
469 originalSwitchBB->bbJumpDest = jumpTab[0];
470 }
471 // Remove extra predecessor links if there was more than one case.
472 for (unsigned i = 1; i < jumpCnt; ++i)
473 {
474 (void)comp->fgRemoveRefPred(jumpTab[i], originalSwitchBB);
475 }
476
477 // We have to get rid of the GT_SWITCH node but a child might have side effects so just assign
478 // the result of the child subtree to a temp.
479 GenTree* rhs = node->gtOp.gtOp1;
480
481 unsigned lclNum = comp->lvaGrabTemp(true DEBUGARG("Lowering is creating a new local variable"));
482 comp->lvaTable[lclNum].lvType = rhs->TypeGet();
483
484 GenTreeLclVar* store =
485 new (comp, GT_STORE_LCL_VAR) GenTreeLclVar(GT_STORE_LCL_VAR, rhs->TypeGet(), lclNum, BAD_IL_OFFSET);
486 store->gtOp1 = rhs;
487 store->gtFlags = (rhs->gtFlags & GTF_COMMON_MASK);
488 store->gtFlags |= GTF_VAR_DEF;
489
490 switchBBRange.InsertAfter(node, store);
491 switchBBRange.Remove(node);
492
493 return store;
494 }
495
496 noway_assert(jumpCnt >= 2);
497
498 // Spill the argument to the switch node into a local so that it can be used later.
499 unsigned blockWeight = originalSwitchBB->getBBWeight(comp);
500
501 LIR::Use use(switchBBRange, &(node->gtOp.gtOp1), node);
502 ReplaceWithLclVar(use);
503
504 // GT_SWITCH(indexExpression) is now two statements:
505 // 1. a statement containing 'asg' (for temp = indexExpression)
506 // 2. and a statement with GT_SWITCH(temp)
507
508 assert(node->gtOper == GT_SWITCH);
509 GenTree* temp = node->gtOp.gtOp1;
510 assert(temp->gtOper == GT_LCL_VAR);
511 unsigned tempLclNum = temp->gtLclVarCommon.gtLclNum;
512 LclVarDsc* tempVarDsc = comp->lvaTable + tempLclNum;
513 var_types tempLclType = temp->TypeGet();
514
515 BasicBlock* defaultBB = jumpTab[jumpCnt - 1];
516 BasicBlock* followingBB = originalSwitchBB->bbNext;
517
518 /* Is the number of cases right for a test and jump switch? */
519 const bool fFirstCaseFollows = (followingBB == jumpTab[0]);
520 const bool fDefaultFollows = (followingBB == defaultBB);
521
522 unsigned minSwitchTabJumpCnt = 2; // table is better than just 2 cmp/jcc
523
524 // This means really just a single cmp/jcc (aka a simple if/else)
525 if (fFirstCaseFollows || fDefaultFollows)
526 {
527 minSwitchTabJumpCnt++;
528 }
529
530#if defined(_TARGET_ARM_)
531 // On ARM for small switch tables we will
532 // generate a sequence of compare and branch instructions
533 // because the code to load the base of the switch
534 // table is huge and hideous due to the relocation... :(
535 minSwitchTabJumpCnt += 2;
536#endif // _TARGET_ARM_
537
538 // Once we have the temporary variable, we construct the conditional branch for
539 // the default case. As stated above, this conditional is being shared between
540 // both GT_SWITCH lowering code paths.
541 // This condition is of the form: if (temp > jumpTableLength - 2){ goto jumpTable[jumpTableLength - 1]; }
542 GenTree* gtDefaultCaseCond = comp->gtNewOperNode(GT_GT, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
543 comp->gtNewIconNode(jumpCnt - 2, genActualType(tempLclType)));
544
    // Make sure we perform an unsigned comparison, just in case the switch index in 'temp'
    // is now less than zero (that would also hit the default case).
547 gtDefaultCaseCond->gtFlags |= GTF_UNSIGNED;
548
549 GenTree* gtDefaultCaseJump = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtDefaultCaseCond);
550 gtDefaultCaseJump->gtFlags = node->gtFlags;
551
552 LIR::Range condRange = LIR::SeqTree(comp, gtDefaultCaseJump);
553 switchBBRange.InsertAtEnd(std::move(condRange));
554
555 BasicBlock* afterDefaultCondBlock = comp->fgSplitBlockAfterNode(originalSwitchBB, condRange.LastNode());
556
557 // afterDefaultCondBlock is now the switch, and all the switch targets have it as a predecessor.
558 // originalSwitchBB is now a BBJ_NONE, and there is a predecessor edge in afterDefaultCondBlock
559 // representing the fall-through flow from originalSwitchBB.
560 assert(originalSwitchBB->bbJumpKind == BBJ_NONE);
561 assert(originalSwitchBB->bbNext == afterDefaultCondBlock);
562 assert(afterDefaultCondBlock->bbJumpKind == BBJ_SWITCH);
563 assert(afterDefaultCondBlock->bbJumpSwt->bbsHasDefault);
564 assert(afterDefaultCondBlock->isEmpty()); // Nothing here yet.
565
566 // The GT_SWITCH code is still in originalSwitchBB (it will be removed later).
567
568 // Turn originalSwitchBB into a BBJ_COND.
569 originalSwitchBB->bbJumpKind = BBJ_COND;
570 originalSwitchBB->bbJumpDest = jumpTab[jumpCnt - 1];
571
    // Fix the pred for the default case: the default block target still has originalSwitchBB
    // as a predecessor, but the fgSplitBlockAfterNode() call above moved all predecessors to point
    // to afterDefaultCondBlock.
575 flowList* oldEdge = comp->fgRemoveRefPred(jumpTab[jumpCnt - 1], afterDefaultCondBlock);
576 comp->fgAddRefPred(jumpTab[jumpCnt - 1], originalSwitchBB, oldEdge);
577
578 bool useJumpSequence = jumpCnt < minSwitchTabJumpCnt;
579
#if defined(_TARGET_UNIX_) && defined(_TARGET_ARM_)
    // Force the use of an inlined jump sequence instead of switch table generation.
    // The switch jump table is generated with incorrect values in the CoreRT case,
    // so any large switch will crash after loading any such value into the PC.
    // This is probably because we use absolute addressing
    // instead of relative addressing. But CoreRT as a rule uses relative
    // addressing when we generate an executable.
    // See also https://github.com/dotnet/coreclr/issues/13194
    // Also https://github.com/dotnet/coreclr/pull/13197
    useJumpSequence = useJumpSequence || comp->IsTargetAbi(CORINFO_CORERT_ABI);
#endif // defined(_TARGET_UNIX_) && defined(_TARGET_ARM_)
591
592 // If we originally had 2 unique successors, check to see whether there is a unique
593 // non-default case, in which case we can eliminate the switch altogether.
594 // Note that the single unique successor case is handled above.
595 BasicBlock* uniqueSucc = nullptr;
596 if (targetCnt == 2)
597 {
598 uniqueSucc = jumpTab[0];
599 noway_assert(jumpCnt >= 2);
600 for (unsigned i = 1; i < jumpCnt - 1; i++)
601 {
602 if (jumpTab[i] != uniqueSucc)
603 {
604 uniqueSucc = nullptr;
605 break;
606 }
607 }
608 }
609 if (uniqueSucc != nullptr)
610 {
611 // If the unique successor immediately follows this block, we have nothing to do -
612 // it will simply fall-through after we remove the switch, below.
613 // Otherwise, make this a BBJ_ALWAYS.
        // Now, fixup the predecessor links to uniqueSucc. In the original jumpTab:
        //   jumpTab[jumpCnt - 1] was the default target, which we handled above,
        //   jumpTab[0] is the first target, and we'll leave that predecessor link.
        // Remove any additional predecessor links to uniqueSucc.
618 for (unsigned i = 1; i < jumpCnt - 1; ++i)
619 {
620 assert(jumpTab[i] == uniqueSucc);
621 (void)comp->fgRemoveRefPred(uniqueSucc, afterDefaultCondBlock);
622 }
623 if (afterDefaultCondBlock->bbNext == uniqueSucc)
624 {
625 afterDefaultCondBlock->bbJumpKind = BBJ_NONE;
626 afterDefaultCondBlock->bbJumpDest = nullptr;
627 }
628 else
629 {
630 afterDefaultCondBlock->bbJumpKind = BBJ_ALWAYS;
631 afterDefaultCondBlock->bbJumpDest = uniqueSucc;
632 }
633 }
634 // If the number of possible destinations is small enough, we proceed to expand the switch
635 // into a series of conditional branches, otherwise we follow the jump table based switch
636 // transformation.
637 else if (useJumpSequence || comp->compStressCompile(Compiler::STRESS_SWITCH_CMP_BR_EXPANSION, 50))
638 {
639 // Lower the switch into a series of compare and branch IR trees.
640 //
641 // In this case we will morph the node in the following way:
642 // 1. Generate a JTRUE statement to evaluate the default case. (This happens above.)
643 // 2. Start splitting the switch basic block into subsequent basic blocks, each of which will contain
644 // a statement that is responsible for performing a comparison of the table index and conditional
645 // branch if equal.
646
647 JITDUMP("Lowering switch " FMT_BB ": using compare/branch expansion\n", originalSwitchBB->bbNum);
648
649 // We'll use 'afterDefaultCondBlock' for the first conditional. After that, we'll add new
650 // blocks. If we end up not needing it at all (say, if all the non-default cases just fall through),
651 // we'll delete it.
652 bool fUsedAfterDefaultCondBlock = false;
653 BasicBlock* currentBlock = afterDefaultCondBlock;
654 LIR::Range* currentBBRange = &LIR::AsRange(currentBlock);
655
        // Walk over the non-default entries, 0 through jumpCnt - 2. If a case target follows,
        // ignore it and let it fall through. If no case target follows, the last one doesn't need
        // to be a compare/branch: it can be an unconditional branch.
659 bool fAnyTargetFollows = false;
660 for (unsigned i = 0; i < jumpCnt - 1; ++i)
661 {
662 assert(currentBlock != nullptr);
663
664 // Remove the switch from the predecessor list of this case target's block.
665 // We'll add the proper new predecessor edge later.
666 flowList* oldEdge = comp->fgRemoveRefPred(jumpTab[i], afterDefaultCondBlock);
667
668 if (jumpTab[i] == followingBB)
669 {
670 // This case label follows the switch; let it fall through.
671 fAnyTargetFollows = true;
672 continue;
673 }
674
675 // We need a block to put in the new compare and/or branch.
676 // If we haven't used the afterDefaultCondBlock yet, then use that.
677 if (fUsedAfterDefaultCondBlock)
678 {
679 BasicBlock* newBlock = comp->fgNewBBafter(BBJ_NONE, currentBlock, true);
680 comp->fgAddRefPred(newBlock, currentBlock); // The fall-through predecessor.
681 currentBlock = newBlock;
682 currentBBRange = &LIR::AsRange(currentBlock);
683 }
684 else
685 {
686 assert(currentBlock == afterDefaultCondBlock);
687 fUsedAfterDefaultCondBlock = true;
688 }
689
690 // We're going to have a branch, either a conditional or unconditional,
691 // to the target. Set the target.
692 currentBlock->bbJumpDest = jumpTab[i];
693
694 // Wire up the predecessor list for the "branch" case.
695 comp->fgAddRefPred(jumpTab[i], currentBlock, oldEdge);
696
697 if (!fAnyTargetFollows && (i == jumpCnt - 2))
698 {
699 // We're processing the last one, and there is no fall through from any case
700 // to the following block, so we can use an unconditional branch to the final
701 // case: there is no need to compare against the case index, since it's
702 // guaranteed to be taken (since the default case was handled first, above).
703
704 currentBlock->bbJumpKind = BBJ_ALWAYS;
705 }
706 else
707 {
708 // Otherwise, it's a conditional branch. Set the branch kind, then add the
709 // condition statement.
710 currentBlock->bbJumpKind = BBJ_COND;
711
712 // Now, build the conditional statement for the current case that is
713 // being evaluated:
714 // GT_JTRUE
715 // |__ GT_COND
716 // |____GT_EQ
717 // |____ (switchIndex) (The temp variable)
718 // |____ (ICon) (The actual case constant)
719 GenTree* gtCaseCond = comp->gtNewOperNode(GT_EQ, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
720 comp->gtNewIconNode(i, tempLclType));
721 GenTree* gtCaseBranch = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtCaseCond);
722 LIR::Range caseRange = LIR::SeqTree(comp, gtCaseBranch);
723 currentBBRange->InsertAtEnd(std::move(caseRange));
724 }
725 }
726
727 if (fAnyTargetFollows)
728 {
729 // There is a fall-through to the following block. In the loop
730 // above, we deleted all the predecessor edges from the switch.
731 // In this case, we need to add one back.
732 comp->fgAddRefPred(currentBlock->bbNext, currentBlock);
733 }
734
735 if (!fUsedAfterDefaultCondBlock)
736 {
737 // All the cases were fall-through! We don't need this block.
738 // Convert it from BBJ_SWITCH to BBJ_NONE and unset the BBF_DONT_REMOVE flag
739 // so fgRemoveBlock() doesn't complain.
740 JITDUMP("Lowering switch " FMT_BB ": all switch cases were fall-through\n", originalSwitchBB->bbNum);
741 assert(currentBlock == afterDefaultCondBlock);
742 assert(currentBlock->bbJumpKind == BBJ_SWITCH);
743 currentBlock->bbJumpKind = BBJ_NONE;
744 currentBlock->bbFlags &= ~BBF_DONT_REMOVE;
745 comp->fgRemoveBlock(currentBlock, /* unreachable */ false); // It's an empty block.
746 }
747 }
748 else
749 {
750 // At this point the default case has already been handled and we need to generate a jump
751 // table based switch or a bit test based switch at the end of afterDefaultCondBlock. Both
752 // switch variants need the switch value so create the necessary LclVar node here.
753 GenTree* switchValue = comp->gtNewLclvNode(tempLclNum, tempLclType);
754 LIR::Range& switchBlockRange = LIR::AsRange(afterDefaultCondBlock);
755 switchBlockRange.InsertAtEnd(switchValue);
756
757 // Try generating a bit test based switch first,
758 // if that's not possible a jump table based switch will be generated.
759 if (!TryLowerSwitchToBitTest(jumpTab, jumpCnt, targetCnt, afterDefaultCondBlock, switchValue))
760 {
761 JITDUMP("Lowering switch " FMT_BB ": using jump table expansion\n", originalSwitchBB->bbNum);
762
763#ifdef _TARGET_64BIT_
764 if (tempLclType != TYP_I_IMPL)
765 {
766 // SWITCH_TABLE expects the switch value (the index into the jump table) to be TYP_I_IMPL.
767 // Note that the switch value is unsigned so the cast should be unsigned as well.
768 switchValue = comp->gtNewCastNode(TYP_I_IMPL, switchValue, true, TYP_U_IMPL);
769 switchBlockRange.InsertAtEnd(switchValue);
770 }
771#endif
772
773 GenTree* switchTable = comp->gtNewJmpTableNode();
774 GenTree* switchJump = comp->gtNewOperNode(GT_SWITCH_TABLE, TYP_VOID, switchValue, switchTable);
775 switchBlockRange.InsertAfter(switchValue, switchTable, switchJump);
776
777 // this block no longer branches to the default block
778 afterDefaultCondBlock->bbJumpSwt->removeDefault();
779 }
780
781 comp->fgInvalidateSwitchDescMapEntry(afterDefaultCondBlock);
782 }
783
784 GenTree* next = node->gtNext;
785
786 // Get rid of the GT_SWITCH(temp).
787 switchBBRange.Remove(node->gtOp.gtOp1);
788 switchBBRange.Remove(node);
789
790 return next;
791}
792
793//------------------------------------------------------------------------
794// TryLowerSwitchToBitTest: Attempts to transform a jump table switch into a bit test.
795//
796// Arguments:
797// jumpTable - The jump table
798// jumpCount - The number of blocks in the jump table
799// targetCount - The number of distinct blocks in the jump table
800// bbSwitch - The switch block
801// switchValue - A LclVar node that provides the switch value
802//
803// Return value:
804// true if the switch has been lowered to a bit test
805//
// Notes:
//    If the jump table contains no more than 32 (64 on 64 bit targets) entries and there
//    are at most 2 distinct jump targets then the jump table can be converted to a word
//    of bits where a 0 bit corresponds to one jump target and a 1 bit corresponds to the
//    other jump target. Instead of the indirect jump a BT-JCC sequence is used to jump
//    to the appropriate target:
//        mov eax, 245  ; jump table converted to a "bit table"
//        bt  eax, ebx  ; ebx is supposed to contain the switch value
//        jc target1
//      target0:
//        ...
//      target1:
//    Such code is both shorter and faster (in part due to the removal of a memory load)
//    than the traditional jump table based code. And of course, it also avoids the need
//    to emit the jump table itself, which can reach up to 256 bytes (for 64 entries).
//
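//    As an illustration only (hypothetical blocks BB01/BB02, not taken from a real dump), a jump
//    table such as
//        { BB02, BB01, BB01, BB02, BB02, BBdefault }
//    has bitCount = 5 and, using BB02 as the "1" target, produces the bit table
//        bitTable = 0b11001 = 25
//    so "bt bitTable, switchValue" followed by a carry-based jump selects between BB01 and BB02.
//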
822bool Lowering::TryLowerSwitchToBitTest(
823 BasicBlock* jumpTable[], unsigned jumpCount, unsigned targetCount, BasicBlock* bbSwitch, GenTree* switchValue)
824{
825#ifndef _TARGET_XARCH_
826 // Other architectures may use this if they substitute GT_BT with equivalent code.
827 return false;
828#else
829 assert(jumpCount >= 2);
830 assert(targetCount >= 2);
831 assert(bbSwitch->bbJumpKind == BBJ_SWITCH);
832 assert(switchValue->OperIs(GT_LCL_VAR));
833
834 //
835 // Quick check to see if it's worth going through the jump table. The bit test switch supports
836 // up to 2 targets but targetCount also includes the default block so we need to allow 3 targets.
837 // We'll ensure that there are only 2 targets when building the bit table.
838 //
839
840 if (targetCount > 3)
841 {
842 return false;
843 }
844
845 //
846 // The number of bits in the bit table is the same as the number of jump table entries. But the
847 // jump table also includes the default target (at the end) so we need to ignore it. The default
848 // has already been handled by a JTRUE(GT(switchValue, jumpCount - 2)) that LowerSwitch generates.
849 //
850
851 const unsigned bitCount = jumpCount - 1;
852
853 if (bitCount > (genTypeSize(TYP_I_IMPL) * 8))
854 {
855 return false;
856 }
857
858 //
859 // Build a bit table where a bit set to 0 corresponds to bbCase0 and a bit set to 1 corresponds to
860 // bbCase1. Simply use the first block in the jump table as bbCase1, later we can invert the bit
861 // table and/or swap the blocks if it's beneficial.
862 //
863
864 BasicBlock* bbCase0 = nullptr;
865 BasicBlock* bbCase1 = jumpTable[0];
866 size_t bitTable = 1;
867
868 for (unsigned bitIndex = 1; bitIndex < bitCount; bitIndex++)
869 {
870 if (jumpTable[bitIndex] == bbCase1)
871 {
872 bitTable |= (size_t(1) << bitIndex);
873 }
874 else if (bbCase0 == nullptr)
875 {
876 bbCase0 = jumpTable[bitIndex];
877 }
878 else if (jumpTable[bitIndex] != bbCase0)
879 {
880 // If it's neither bbCase0 nor bbCase1 then it means we have 3 targets. There can't be more
881 // than 3 because of the check at the start of the function.
882 assert(targetCount == 3);
883 return false;
884 }
885 }
886
887 //
888 // One of the case blocks has to follow the switch block. This requirement could be avoided
889 // by adding a BBJ_ALWAYS block after the switch block but doing that sometimes negatively
890 // impacts register allocation.
891 //
892
893 if ((bbSwitch->bbNext != bbCase0) && (bbSwitch->bbNext != bbCase1))
894 {
895 return false;
896 }
897
898#ifdef _TARGET_64BIT_
    //
    // See if we can avoid an 8 byte immediate on 64 bit targets. If all upper 32 bits are 1
    // then inverting the bit table will make them 0 so that the table now fits in 32 bits.
    // Note that this does not change the number of bits in the bit table, it just takes
    // advantage of the fact that loading a 32 bit immediate into a 64 bit register zero
    // extends the immediate value to 64 bit.
    //
906
907 if (~bitTable <= UINT32_MAX)
908 {
909 bitTable = ~bitTable;
910 std::swap(bbCase0, bbCase1);
911 }
912#endif
913
914 //
915 // Rewire the blocks as needed and figure out the condition to use for JCC.
916 //
917
918 genTreeOps bbSwitchCondition = GT_NONE;
919 bbSwitch->bbJumpKind = BBJ_COND;
920
921 comp->fgRemoveAllRefPreds(bbCase1, bbSwitch);
922 comp->fgRemoveAllRefPreds(bbCase0, bbSwitch);
923
924 if (bbSwitch->bbNext == bbCase0)
925 {
926 // GT_LT + GTF_UNSIGNED generates JC so we jump to bbCase1 when the bit is set
927 bbSwitchCondition = GT_LT;
928 bbSwitch->bbJumpDest = bbCase1;
929
930 comp->fgAddRefPred(bbCase0, bbSwitch);
931 comp->fgAddRefPred(bbCase1, bbSwitch);
932 }
933 else
934 {
935 assert(bbSwitch->bbNext == bbCase1);
936
937 // GT_GE + GTF_UNSIGNED generates JNC so we jump to bbCase0 when the bit is not set
938 bbSwitchCondition = GT_GE;
939 bbSwitch->bbJumpDest = bbCase0;
940
941 comp->fgAddRefPred(bbCase0, bbSwitch);
942 comp->fgAddRefPred(bbCase1, bbSwitch);
943 }
944
945 //
946 // Append BT(bitTable, switchValue) and JCC(condition) to the switch block.
947 //
948
949 var_types bitTableType = (bitCount <= (genTypeSize(TYP_INT) * 8)) ? TYP_INT : TYP_LONG;
950 GenTree* bitTableIcon = comp->gtNewIconNode(bitTable, bitTableType);
951 GenTree* bitTest = comp->gtNewOperNode(GT_BT, TYP_VOID, bitTableIcon, switchValue);
952 bitTest->gtFlags |= GTF_SET_FLAGS;
953 GenTreeCC* jcc = new (comp, GT_JCC) GenTreeCC(GT_JCC, bbSwitchCondition);
954 jcc->gtFlags |= GTF_UNSIGNED | GTF_USE_FLAGS;
955
956 LIR::AsRange(bbSwitch).InsertAfter(switchValue, bitTableIcon, bitTest, jcc);
957
958 return true;
959#endif // _TARGET_XARCH_
960}
961
962// NOTE: this method deliberately does not update the call arg table. It must only
963// be used by NewPutArg and LowerArg; these functions are responsible for updating
964// the call arg table as necessary.
965void Lowering::ReplaceArgWithPutArgOrBitcast(GenTree** argSlot, GenTree* putArgOrBitcast)
966{
967 assert(argSlot != nullptr);
968 assert(*argSlot != nullptr);
969 assert(putArgOrBitcast->OperIsPutArg() || putArgOrBitcast->OperIs(GT_BITCAST));
970
971 GenTree* arg = *argSlot;
972
973 // Replace the argument with the putarg/copy
974 *argSlot = putArgOrBitcast;
975 putArgOrBitcast->gtOp.gtOp1 = arg;
976
977 // Insert the putarg/copy into the block
978 BlockRange().InsertAfter(arg, putArgOrBitcast);
979}
980
981//------------------------------------------------------------------------
982// NewPutArg: rewrites the tree to put an arg in a register or on the stack.
983//
984// Arguments:
985// call - the call whose arg is being rewritten.
986// arg - the arg being rewritten.
987// info - the fgArgTabEntry information for the argument.
988// type - the type of the argument.
989//
990// Return Value:
991// The new tree that was created to put the arg in the right place
992// or the incoming arg if the arg tree was not rewritten.
993//
994// Assumptions:
995// call, arg, and info must be non-null.
996//
// Notes:
//    For System V systems with native struct passing (i.e. UNIX_AMD64_ABI defined)
//    this method allocates a single GT_PUTARG_REG for a one-eightbyte struct and a GT_FIELD_LIST
//    of two GT_PUTARG_REGs for a two-eightbyte struct.
//
//    For stack-passed (STK) structs the method generates a GT_PUTARG_STK tree. For System V systems
//    with native struct passing (i.e. UNIX_AMD64_ABI defined) this method also sets the GC pointer
//    count and the pointer layout object, so the codegen of the GT_PUTARG_STK can use this to optimize
//    copying to the stack by value (using block copy primitives for non GC pointers and a single
//    TARGET_POINTER_SIZE copy with recording of GC info).
//
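//    As an illustration only (a simplified sketch, not an exact IR dump), a two-eightbyte struct
//    passed in rdi/rsi under UNIX_AMD64_ABI ends up shaped roughly as
//
//        GT_FIELD_LIST
//           |__ GT_PUTARG_REG (targetReg = rdi)   <- first eightbyte
//           |__ GT_PUTARG_REG (targetReg = rsi)   <- second eightbyte
//
//    while a stack-passed struct becomes GT_PUTARG_STK(GT_OBJ(addr)) annotated with its GC layout.
//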
1007GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* info, var_types type)
1008{
1009 assert(call != nullptr);
1010 assert(arg != nullptr);
1011 assert(info != nullptr);
1012
1013 GenTree* putArg = nullptr;
1014 bool updateArgTable = true;
1015
1016 bool isOnStack = true;
1017 isOnStack = info->regNum == REG_STK;
1018
1019#ifdef _TARGET_ARMARCH_
    // Mark contained when we pass a struct.
    // GT_FIELD_LIST is always marked contained when it is generated.
1022 if (type == TYP_STRUCT)
1023 {
1024 arg->SetContained();
1025 if ((arg->OperGet() == GT_OBJ) && (arg->AsObj()->Addr()->OperGet() == GT_LCL_VAR_ADDR))
1026 {
1027 MakeSrcContained(arg, arg->AsObj()->Addr());
1028 }
1029 }
1030#endif
1031
1032#if FEATURE_ARG_SPLIT
1033 // Struct can be split into register(s) and stack on ARM
1034 if (info->isSplit)
1035 {
1036 assert(arg->OperGet() == GT_OBJ || arg->OperGet() == GT_FIELD_LIST);
1037 // TODO: Need to check correctness for FastTailCall
1038 if (call->IsFastTailCall())
1039 {
1040#ifdef _TARGET_ARM_
1041 NYI_ARM("lower: struct argument by fast tail call");
1042#endif // _TARGET_ARM_
1043 }
1044
1045 putArg = new (comp, GT_PUTARG_SPLIT)
1046 GenTreePutArgSplit(arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), info->numRegs,
1047 call->IsFastTailCall(), call);
1048
        // If the struct argument is morphed to GT_FIELD_LIST node(s),
        // we can get the GC info from the type of each GT_FIELD_LIST node,
        // so we skip setting GC pointer info here.
        //
1053 GenTreePutArgSplit* argSplit = putArg->AsPutArgSplit();
1054 for (unsigned regIndex = 0; regIndex < info->numRegs; regIndex++)
1055 {
1056 argSplit->SetRegNumByIdx(info->getRegNum(regIndex), regIndex);
1057 }
1058
1059 if (arg->OperGet() == GT_OBJ)
1060 {
1061 BYTE* gcLayout = nullptr;
1062 unsigned numRefs = 0;
1063 GenTreeObj* argObj = arg->AsObj();
1064
1065 if (argObj->IsGCInfoInitialized())
1066 {
1067 gcLayout = argObj->gtGcPtrs;
1068 numRefs = argObj->GetGcPtrCount();
1069 }
1070 else
1071 {
1072 // Set GC Pointer info
1073 gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots + info->numRegs];
1074 numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
1075 argSplit->setGcPointers(numRefs, gcLayout);
1076 }
1077
1078 // Set type of registers
1079 for (unsigned index = 0; index < info->numRegs; index++)
1080 {
1081 var_types regType = comp->getJitGCType(gcLayout[index]);
1082 // Account for the possibility that float fields may be passed in integer registers.
1083 if (varTypeIsFloating(regType) && !genIsValidFloatReg(argSplit->GetRegNumByIdx(index)))
1084 {
1085 regType = (regType == TYP_FLOAT) ? TYP_INT : TYP_LONG;
1086 }
1087 argSplit->m_regType[index] = regType;
1088 }
1089 }
1090 else
1091 {
1092 GenTreeFieldList* fieldListPtr = arg->AsFieldList();
1093 for (unsigned index = 0; index < info->numRegs; fieldListPtr = fieldListPtr->Rest(), index++)
1094 {
1095 var_types regType = fieldListPtr->gtGetOp1()->TypeGet();
1096 // Account for the possibility that float fields may be passed in integer registers.
1097 if (varTypeIsFloating(regType) && !genIsValidFloatReg(argSplit->GetRegNumByIdx(index)))
1098 {
1099 regType = (regType == TYP_FLOAT) ? TYP_INT : TYP_LONG;
1100 }
1101 argSplit->m_regType[index] = regType;
1102
1103 // Clear the register assignments on the fieldList nodes, as these are contained.
1104 fieldListPtr->gtRegNum = REG_NA;
1105 }
1106 }
1107 }
1108 else
1109#endif // FEATURE_ARG_SPLIT
1110 {
1111 if (!isOnStack)
1112 {
1113#if FEATURE_MULTIREG_ARGS
1114 if ((info->numRegs > 1) && (arg->OperGet() == GT_FIELD_LIST))
1115 {
1116 assert(arg->OperGet() == GT_FIELD_LIST);
1117
1118 assert(arg->AsFieldList()->IsFieldListHead());
1119 unsigned int regIndex = 0;
1120 for (GenTreeFieldList* fieldListPtr = arg->AsFieldList(); fieldListPtr != nullptr;
1121 fieldListPtr = fieldListPtr->Rest())
1122 {
1123 regNumber argReg = info->getRegNum(regIndex);
1124 GenTree* curOp = fieldListPtr->gtOp.gtOp1;
1125 var_types curTyp = curOp->TypeGet();
1126
1127 // Create a new GT_PUTARG_REG node with op1
1128 GenTree* newOper = comp->gtNewPutArgReg(curTyp, curOp, argReg);
1129
1130 // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST
1131 ReplaceArgWithPutArgOrBitcast(&fieldListPtr->gtOp.gtOp1, newOper);
1132 regIndex++;
1133
1134 // Initialize all the gtRegNum's since the list won't be traversed in an LIR traversal.
1135 fieldListPtr->gtRegNum = REG_NA;
1136 }
1137
1138 // Just return arg. The GT_FIELD_LIST is not replaced.
1139 // Nothing more to do.
1140 return arg;
1141 }
1142 else
1143#endif // FEATURE_MULTIREG_ARGS
1144 {
1145 putArg = comp->gtNewPutArgReg(type, arg, info->regNum);
1146 }
1147 }
1148 else
1149 {
            // Mark this one as a tail call arg if it is a fast tail call.
            // This provides the info to put this argument in the incoming arg area slot
            // instead of in the outgoing arg area slot.

            // Make sure state is correct. The PUTARG_STK has TYP_VOID, as it doesn't produce
            // a result. So the type of its operand must be the correct type to push on the stack.
            // For a FIELD_LIST, this will be the type of the field (not the type of the arg),
            // but otherwise it is generally the type of the operand.
1158 info->checkIsStruct();
1159 if ((arg->OperGet() != GT_FIELD_LIST))
1160 {
1161#if defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1162 if (type == TYP_SIMD12)
1163 {
1164 assert(info->numSlots == 3);
1165 }
1166 else
1167#endif // defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1168 {
1169 assert(genActualType(arg->TypeGet()) == type);
1170 }
1171 }
1172
1173 putArg =
1174 new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg,
1175 info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots),
1176 call->IsFastTailCall(), call);
1177
1178#ifdef FEATURE_PUT_STRUCT_ARG_STK
1179 // If the ArgTabEntry indicates that this arg is a struct
1180 // get and store the number of slots that are references.
1181 // This is later used in the codegen for PUT_ARG_STK implementation
1182 // for struct to decide whether and how many single eight-byte copies
1183 // to be done (only for reference slots), so gcinfo is emitted.
1184 // For non-reference slots faster/smaller size instructions are used -
1185 // pair copying using XMM registers or rep mov instructions.
1186 if (info->isStruct)
1187 {
1188 // We use GT_OBJ only for non-lclVar, non-SIMD, non-FIELD_LIST struct arguments.
1189 if (arg->OperIsLocal())
1190 {
1191 // This must have a type with a known size (SIMD or has been morphed to a primitive type).
1192 assert(arg->TypeGet() != TYP_STRUCT);
1193 }
1194 else if (arg->OperIs(GT_OBJ))
1195 {
1196 unsigned numRefs = 0;
1197 BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
1198 assert(!varTypeIsSIMD(arg));
1199 numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
1200 putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
1201
1202#ifdef _TARGET_X86_
                // On x86 the VM lies about the type of a struct containing a pointer sized
                // integer field by returning the type of its field as the type of the struct.
                // Such a struct can be passed in a register depending on its position in the
                // parameter list. The VM does this unwrapping only one level deep and therefore
                // a type like Struct Foo { Struct Bar { int f } } always needs to be
                // passed on the stack. Also, the VM doesn't lie about the type of such a struct
                // when it is a field of another struct. That is, the VM doesn't lie about
                // the type of Foo.Bar.
1211 //
1212 // We now support the promotion of fields that are of type struct.
1213 // However we only support a limited case where the struct field has a
1214 // single field and that single field must be a scalar type. Say Foo.Bar
1215 // field is getting passed as a parameter to a call, Since it is a TYP_STRUCT,
1216 // as per x86 ABI it should always be passed on stack. Therefore GenTree
1217 // node under a PUTARG_STK could be GT_OBJ(GT_LCL_VAR_ADDR(v1)), where
1218 // local v1 could be a promoted field standing for Foo.Bar. Note that
1219 // the type of v1 will be the type of field of Foo.Bar.f when Foo is
1220 // promoted. That is v1 will be a scalar type. In this case we need to
1221 // pass v1 on stack instead of in a register.
1222 //
1223 // TODO-PERF: replace GT_OBJ(GT_LCL_VAR_ADDR(v1)) with v1 if v1 is
1224 // a scalar type and the width of GT_OBJ matches the type size of v1.
1225 // Note that this cannot be done till call node arguments are morphed
1226 // because we should not lose the fact that the type of argument is
1227 // a struct so that the arg gets correctly marked to be passed on stack.
1228 GenTree* objOp1 = arg->gtGetOp1();
1229 if (objOp1->OperGet() == GT_LCL_VAR_ADDR)
1230 {
1231 unsigned lclNum = objOp1->AsLclVarCommon()->GetLclNum();
1232 if (comp->lvaTable[lclNum].lvType != TYP_STRUCT)
1233 {
1234 comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(Compiler::DNER_VMNeedsStackAddr));
1235 }
1236 }
1237#endif // _TARGET_X86_
1238 }
1239 else if (!arg->OperIs(GT_FIELD_LIST))
1240 {
1241 assert(varTypeIsSIMD(arg) || (info->numSlots == 1));
1242 }
1243 }
1244#endif // FEATURE_PUT_STRUCT_ARG_STK
1245 }
1246 }
1247
1248 JITDUMP("new node is : ");
1249 DISPNODE(putArg);
1250 JITDUMP("\n");
1251
1252 if (arg->gtFlags & GTF_LATE_ARG)
1253 {
1254 putArg->gtFlags |= GTF_LATE_ARG;
1255 }
1256 else if (updateArgTable)
1257 {
1258 info->node = putArg;
1259 }
1260 return putArg;
1261}
1262
1263//------------------------------------------------------------------------
1264// LowerArg: Lower one argument of a call. This entails splicing a "putarg" node between
1265// the argument evaluation and the call. This is the point at which the source is
1266// consumed and the value transitions from control of the register allocator to the calling
1267// convention.
1268//
1269// Arguments:
1270// call - The call node
1271// ppArg - Pointer to the call argument pointer. We might replace the call argument by
1272// changing *ppArg.
1273//
1274// Return Value:
1275// None.
1276//
1277void Lowering::LowerArg(GenTreeCall* call, GenTree** ppArg)
1278{
1279 GenTree* arg = *ppArg;
1280
1281 JITDUMP("lowering arg : ");
1282 DISPNODE(arg);
1283
1284 // No assignments should remain by Lowering.
1285 assert(!arg->OperIs(GT_ASG));
1286 assert(!arg->OperIsPutArgStk());
1287
1288 // Assignments/stores at this level are not really placing an argument.
1289 // They are setting up temporary locals that will later be placed into
1290 // outgoing regs or stack.
1291 // Note that atomic ops may be stores and still produce a value.
1292 if (!arg->IsValue())
1293 {
1294 assert((arg->OperIsStore() && !arg->IsValue()) || arg->IsArgPlaceHolderNode() || arg->IsNothingNode() ||
1295 arg->OperIsCopyBlkOp());
1296 return;
1297 }
1298
1299 fgArgTabEntry* info = comp->gtArgEntryByNode(call, arg);
1300 assert(info->node == arg);
1301 var_types type = arg->TypeGet();
1302
1303 if (varTypeIsSmall(type))
1304 {
1305 // Normalize 'type', it represents the item that we will be storing in the Outgoing Args
1306 type = TYP_INT;
1307 }
1308
1309#if defined(FEATURE_SIMD)
1310#if defined(_TARGET_X86_)
1311 // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their
1312 // allocated size (see lvSize()). However, when passing the variables as arguments, and
1313 // storing the variables to the outgoing argument area on the stack, we must use their
1314 // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written.
1315 if (type == TYP_SIMD16)
1316 {
1317 if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR))
1318 {
1319 unsigned varNum = arg->AsLclVarCommon()->GetLclNum();
1320 LclVarDsc* varDsc = &comp->lvaTable[varNum];
1321 type = varDsc->lvType;
1322 }
1323 else if (arg->OperGet() == GT_SIMD)
1324 {
1325 assert((arg->AsSIMD()->gtSIMDSize == 16) || (arg->AsSIMD()->gtSIMDSize == 12));
1326
1327 if (arg->AsSIMD()->gtSIMDSize == 12)
1328 {
1329 type = TYP_SIMD12;
1330 }
1331 }
1332 }
1333#elif defined(_TARGET_AMD64_)
1334 // TYP_SIMD8 parameters that are passed as longs
1335 if (type == TYP_SIMD8 && genIsValidIntReg(info->regNum))
1336 {
1337 GenTreeUnOp* bitcast = new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, TYP_LONG, arg, nullptr);
1338 BlockRange().InsertAfter(arg, bitcast);
1339
1340 info->node = *ppArg = arg = bitcast;
1341 type = TYP_LONG;
1342 }
1343#endif // defined(_TARGET_X86_)
1344#endif // defined(FEATURE_SIMD)
1345
1346 // If we hit this we are probably double-lowering.
1347 assert(!arg->OperIsPutArg());
1348
1349#if !defined(_TARGET_64BIT_)
1350 if (varTypeIsLong(type))
1351 {
1352 bool isReg = (info->regNum != REG_STK);
1353 if (isReg)
1354 {
1355 noway_assert(arg->OperGet() == GT_LONG);
1356 assert(info->numRegs == 2);
1357
1358 GenTree* argLo = arg->gtGetOp1();
1359 GenTree* argHi = arg->gtGetOp2();
1360
1361 GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList(argLo, 0, TYP_INT, nullptr);
1362 // Only the first fieldList node (GTF_FIELD_LIST_HEAD) is in the instruction sequence.
1363 (void)new (comp, GT_FIELD_LIST) GenTreeFieldList(argHi, 4, TYP_INT, fieldList);
1364 GenTree* putArg = NewPutArg(call, fieldList, info, type);
1365
1366 BlockRange().InsertBefore(arg, putArg);
1367 BlockRange().Remove(arg);
1368 *ppArg = fieldList;
1369 info->node = fieldList;
1370 }
1371 else
1372 {
1373 assert(arg->OperGet() == GT_LONG);
1374 // For longs, we will replace the GT_LONG with a GT_FIELD_LIST, and put that under a PUTARG_STK.
1375 // Although the hi argument needs to be pushed first, that will be handled by the general case,
1376 // in which the fields will be reversed.
1377 assert(info->numSlots == 2);
1378 GenTree* argLo = arg->gtGetOp1();
1379 GenTree* argHi = arg->gtGetOp2();
1380 GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList(argLo, 0, TYP_INT, nullptr);
1381 // Only the first fieldList node (GTF_FIELD_LIST_HEAD) is in the instruction sequence.
1382 (void)new (comp, GT_FIELD_LIST) GenTreeFieldList(argHi, 4, TYP_INT, fieldList);
1383 GenTree* putArg = NewPutArg(call, fieldList, info, type);
1384 putArg->gtRegNum = info->regNum;
1385
1386 // We can't call ReplaceArgWithPutArgOrBitcast here because it presumes that we are keeping the original
1387 // arg.
1388 BlockRange().InsertBefore(arg, fieldList, putArg);
1389 BlockRange().Remove(arg);
1390 *ppArg = putArg;
1391 }
1392 }
1393 else
1394#endif // !defined(_TARGET_64BIT_)
1395 {
1396
1397#ifdef _TARGET_ARMARCH_
        if (call->IsVarargs() || comp->opts.compUseSoftFP)
        {
            // For a vararg call or on armel, reg args should all be integer.
            // Insert copies as needed to move float values to integer registers.
1402 GenTree* newNode = LowerFloatArg(ppArg, info);
1403 if (newNode != nullptr)
1404 {
1405 type = newNode->TypeGet();
1406 }
1407 }
1408#endif // _TARGET_ARMARCH_
1409
1410 GenTree* putArg = NewPutArg(call, arg, info, type);
1411
1412 // In the case of register passable struct (in one or two registers)
1413 // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_FIELD_LIST with two GT_PUTARG_REGs.)
1414 // If an extra node is returned, splice it in the right place in the tree.
1415 if (arg != putArg)
1416 {
1417 ReplaceArgWithPutArgOrBitcast(ppArg, putArg);
1418 }
1419 }
1420}
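
// For illustration only (a simplified sketch, not an exact IR dump): on 32 bit targets a TYP_LONG
// argument passed in two registers is decomposed, as described above, into
//
//     GT_FIELD_LIST (offset 0, TYP_INT)  -> GT_PUTARG_REG(argLo)
//     GT_FIELD_LIST (offset 4, TYP_INT)  -> GT_PUTARG_REG(argHi)
//
// with only the head GT_FIELD_LIST node appearing in the LIR sequence.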
1421
1422#ifdef _TARGET_ARMARCH_
1423//------------------------------------------------------------------------
1424// LowerFloatArg: Lower float call arguments on the arm platform.
1425//
1426// Arguments:
1427// arg - The arg node
1428// info - call argument info
1429//
1430// Return Value:
1431// Return nullptr, if no transformation was done;
1432// return arg if there was in place transformation;
1433// return a new tree if the root was changed.
1434//
1435// Notes:
1436// This must handle scalar float arguments as well as GT_FIELD_LISTs
1437// with floating point fields.
1438//
1439GenTree* Lowering::LowerFloatArg(GenTree** pArg, fgArgTabEntry* info)
1440{
1441 GenTree* arg = *pArg;
1442 if (info->regNum != REG_STK)
1443 {
1444 if (arg->OperIsFieldList())
1445 {
1446 GenTreeFieldList* currListNode = arg->AsFieldList();
1447 regNumber currRegNumber = info->regNum;
1448
1449 // Transform fields that are passed as registers in place.
1450 unsigned fieldRegCount;
1451 for (unsigned i = 0; i < info->numRegs; i += fieldRegCount)
1452 {
1453 assert(currListNode != nullptr);
1454 GenTree* node = currListNode->Current();
1455 if (varTypeIsFloating(node))
1456 {
1457 GenTree* intNode = LowerFloatArgReg(node, currRegNumber);
1458 assert(intNode != nullptr);
1459
1460 ReplaceArgWithPutArgOrBitcast(currListNode->pCurrent(), intNode);
1461 currListNode->ChangeType(intNode->TypeGet());
1462 }
1463
1464 if (node->TypeGet() == TYP_DOUBLE)
1465 {
1466 currRegNumber = REG_NEXT(REG_NEXT(currRegNumber));
1467 fieldRegCount = 2;
1468 }
1469 else
1470 {
1471 currRegNumber = REG_NEXT(currRegNumber);
1472 fieldRegCount = 1;
1473 }
1474 currListNode = currListNode->Rest();
1475 }
1476 // List fields were replaced in place.
1477 return arg;
1478 }
1479 else if (varTypeIsFloating(arg))
1480 {
1481 GenTree* intNode = LowerFloatArgReg(arg, info->regNum);
1482 assert(intNode != nullptr);
1483 ReplaceArgWithPutArgOrBitcast(pArg, intNode);
1484 return *pArg;
1485 }
1486 }
1487 return nullptr;
1488}
1489
1490//------------------------------------------------------------------------
1491// LowerFloatArgReg: Lower the float call argument node that is passed via register.
1492//
1493// Arguments:
1494// arg - The arg node
1495// regNum - register number
1496//
1497// Return Value:
1498// Return new bitcast node, that moves float to int register.
1499//
1500GenTree* Lowering::LowerFloatArgReg(GenTree* arg, regNumber regNum)
1501{
1502 var_types floatType = arg->TypeGet();
1503 assert(varTypeIsFloating(floatType));
1504 var_types intType = (floatType == TYP_DOUBLE) ? TYP_LONG : TYP_INT;
1505 GenTree* intArg = comp->gtNewBitCastNode(intType, arg);
1506 intArg->gtRegNum = regNum;
1507#ifdef _TARGET_ARM_
1508 if (floatType == TYP_DOUBLE)
1509 {
1510 regNumber nextReg = REG_NEXT(regNum);
1511 intArg->AsMultiRegOp()->gtOtherReg = nextReg;
1512 }
1513#endif
1514 return intArg;
1515}
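
// For illustration only (a simplified sketch, not an exact IR dump): with soft FP on ARM, a
// TYP_DOUBLE argument destined for r0/r1 is wrapped as
//
//     GT_BITCAST<TYP_LONG>(doubleArg)   ; gtRegNum = r0, gtOtherReg = r1
//
// so that register allocation and codegen treat it as an integer register pair.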
1516#endif
1517
1518// do lowering steps for each arg of a call
1519void Lowering::LowerArgsForCall(GenTreeCall* call)
1520{
1521 JITDUMP("objp:\n======\n");
1522 if (call->gtCallObjp)
1523 {
1524 LowerArg(call, &call->gtCallObjp);
1525 }
1526
1527 GenTreeArgList* args = call->gtCallArgs;
1528
1529 JITDUMP("\nargs:\n======\n");
1530 for (; args; args = args->Rest())
1531 {
1532 LowerArg(call, &args->Current());
1533 }
1534
1535 JITDUMP("\nlate:\n======\n");
1536 for (args = call->gtCallLateArgs; args; args = args->Rest())
1537 {
1538 LowerArg(call, &args->Current());
1539 }
1540}
1541
// helper that creates a node representing a relocatable physical address computation
1543GenTree* Lowering::AddrGen(ssize_t addr)
1544{
1545 // this should end up in codegen as : instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, reg, addr)
1546 GenTree* result = comp->gtNewIconHandleNode(addr, GTF_ICON_FTN_ADDR);
1547 return result;
1548}
1549
1550// variant that takes a void*
1551GenTree* Lowering::AddrGen(void* addr)
1552{
1553 return AddrGen((ssize_t)addr);
1554}
1555
1556// do lowering steps for a call
1557// this includes:
1558// - adding the placement nodes (either stack or register variety) for arguments
1559// - lowering the expression that calculates the target address
1560// - adding nodes for other operations that occur after the call sequence starts and before
1561// control transfer occurs (profiling and tail call helpers, pinvoke incantations)
1562//
1563void Lowering::LowerCall(GenTree* node)
1564{
1565 GenTreeCall* call = node->AsCall();
1566
1567 JITDUMP("lowering call (before):\n");
1568 DISPTREERANGE(BlockRange(), call);
1569 JITDUMP("\n");
1570
1571 call->ClearOtherRegs();
1572 LowerArgsForCall(call);
1573
1574 // note that everything generated from this point on runs AFTER the outgoing args are placed
1575 GenTree* controlExpr = nullptr;
1576
1577 // for x86, this is where we record ESP for checking later to make sure the stack is balanced
1578
1579 // Check for Delegate.Invoke(). If so, we inline it. We get the
1580 // target-object and target-function from the delegate-object, and do
1581 // an indirect call.
1582 if (call->IsDelegateInvoke())
1583 {
1584 controlExpr = LowerDelegateInvoke(call);
1585 }
1586 else
1587 {
1588 // Virtual and interface calls
1589 switch (call->gtFlags & GTF_CALL_VIRT_KIND_MASK)
1590 {
1591 case GTF_CALL_VIRT_STUB:
1592 controlExpr = LowerVirtualStubCall(call);
1593 break;
1594
1595 case GTF_CALL_VIRT_VTABLE:
1596 // stub dispatching is off or this is not a virtual call (could be a tailcall)
1597 controlExpr = LowerVirtualVtableCall(call);
1598 break;
1599
1600 case GTF_CALL_NONVIRT:
1601 if (call->IsUnmanaged())
1602 {
1603 controlExpr = LowerNonvirtPinvokeCall(call);
1604 }
1605 else if (call->gtCallType == CT_INDIRECT)
1606 {
1607 controlExpr = LowerIndirectNonvirtCall(call);
1608 }
1609 else
1610 {
1611 controlExpr = LowerDirectCall(call);
1612 }
1613 break;
1614
1615 default:
1616 noway_assert(!"strange call type");
1617 break;
1618 }
1619 }
1620
1621 if (call->IsTailCallViaHelper())
1622 {
1623 // Either controlExpr or gtCallAddr must contain the real call target.
1624 if (controlExpr == nullptr)
1625 {
1626 assert(call->gtCallType == CT_INDIRECT);
1627 assert(call->gtCallAddr != nullptr);
1628 controlExpr = call->gtCallAddr;
1629 }
1630
1631 controlExpr = LowerTailCallViaHelper(call, controlExpr);
1632 }
1633
1634 if (controlExpr != nullptr)
1635 {
1636 LIR::Range controlExprRange = LIR::SeqTree(comp, controlExpr);
1637
1638 JITDUMP("results of lowering call:\n");
1639 DISPRANGE(controlExprRange);
1640
1641 GenTree* insertionPoint = call;
1642 if (!call->IsTailCallViaHelper())
1643 {
1644 // The controlExpr should go before the gtCallCookie and the gtCallAddr, if they exist
1645 //
1646 // TODO-LIR: find out what's really required here, as this is currently a tree order
1647 // dependency.
1648 if (call->gtCallType == CT_INDIRECT)
1649 {
1650 bool isClosed = false;
1651 if (call->gtCallCookie != nullptr)
1652 {
1653#ifdef DEBUG
1654 GenTree* firstCallAddrNode = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
1655 assert(isClosed);
1656 assert(call->gtCallCookie->Precedes(firstCallAddrNode));
1657#endif // DEBUG
1658
1659 insertionPoint = BlockRange().GetTreeRange(call->gtCallCookie, &isClosed).FirstNode();
1660 assert(isClosed);
1661 }
1662 else if (call->gtCallAddr != nullptr)
1663 {
1664 insertionPoint = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
1665 assert(isClosed);
1666 }
1667 }
1668 }
1669
1670 ContainCheckRange(controlExprRange);
1671 BlockRange().InsertBefore(insertionPoint, std::move(controlExprRange));
1672
1673 call->gtControlExpr = controlExpr;
1674 }
1675 if (call->IsFastTailCall())
1676 {
1677 // Lowering a fast tail call can introduce new temps to set up args correctly for the Callee.
1678 // This involves patching LCL_VAR and LCL_VAR_ADDR nodes holding Caller stack args
1679 // and replacing them with a new temp. The control expr can also contain nodes that need
1680 // to be patched.
1681 // Therefore fast tail call lowering must be done after controlExpr is inserted into LIR.
1682 // One side effect is that the order of the PInvoke method epilog (PME) and the control
1683 // expression is flipped, since LowerFastTailCall calls InsertPInvokeMethodEpilog.
1684 LowerFastTailCall(call);
1685 }
1686
1687 if (comp->opts.IsJit64Compat())
1688 {
1689 CheckVSQuirkStackPaddingNeeded(call);
1690 }
1691
1692 ContainCheckCallOperands(call);
1693 JITDUMP("lowering call (after):\n");
1694 DISPTREERANGE(BlockRange(), call);
1695 JITDUMP("\n");
1696}
1697
1698 // Though the issue described below gets fixed in the IntelliTrace dll of VS2015 (a.k.a. Dev14),
1699 // we still need this quirk for desktop so that older versions of VS (e.g. VS2010/2012)
1700 // continue to work.
1701// This quirk is excluded from other targets that have no back compat burden.
1702//
1703// Quirk for VS debug-launch scenario to work:
1704// See if this is a PInvoke call with exactly one param that is the address of a struct local.
1705 // In such a case, indicate to the frame-layout logic to add 16 bytes of padding
1706 // between the save-reg area and locals.
1707// overrun bug in microsoft.intellitrace.11.0.0.dll!ProfilerInterop.InitInterop().
1708//
1709// A work-around to this bug is to disable IntelliTrace debugging
1710// (VS->Tools->Options->IntelliTrace->Enable IntelliTrace - uncheck this option).
1711// The reason why this works on Jit64 is that at the point of AV the call stack is
1712//
1713// GetSystemInfo() Native call
1714// IL_Stub generated for PInvoke declaration.
1715// ProfilerInterface::InitInterop()
1716// ProfilerInterface.Cctor()
1717// VM asm worker
1718//
1719 // The cctor body has just the call to InitInterop(). The VM asm worker is holding
1720 // something in rbx that is used immediately after the Cctor call. The Jit64-generated
1721 // InitInterop() method pushes the registers in the following order:
1722//
1723// rbx
1724// rbp
1725// rsi
1726// rdi
1727// r12
1728// r13
1729// Struct local
1730//
1731 // Due to the buffer overrun, rbx doesn't get impacted, whereas RyuJIT-jitted code of
1732 // the same method pushes regs in the following order:
1733//
1734// rbp
1735// rdi
1736// rsi
1737// rbx
1738// struct local
1739//
1740 // Therefore, as a fix, we add padding between the save-reg area and locals to
1741 // make this scenario work against JB.
1742//
1743// Note: If this quirk gets broken due to other JIT optimizations, we should consider
1744 // a more tolerant fix. One such fix is to pad the struct.
1745void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call)
1746{
1747 assert(comp->opts.IsJit64Compat());
1748
1749#ifdef _TARGET_AMD64_
1750 // Confine this to IL stub calls which aren't marked as unmanaged.
1751 if (call->IsPInvoke() && !call->IsUnmanaged())
1752 {
1753 bool paddingNeeded = false;
1754 GenTree* firstPutArgReg = nullptr;
1755 for (GenTreeArgList* args = call->gtCallLateArgs; args; args = args->Rest())
1756 {
1757 GenTree* tmp = args->Current();
1758 if (tmp->OperGet() == GT_PUTARG_REG)
1759 {
1760 if (firstPutArgReg == nullptr)
1761 {
1762 firstPutArgReg = tmp;
1763 GenTree* op1 = firstPutArgReg->gtOp.gtOp1;
1764
1765 if (op1->OperGet() == GT_LCL_VAR_ADDR)
1766 {
1767 unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
1768 // TODO-1stClassStructs: This is here to duplicate previous behavior,
1769 // but is not needed because the scenario being quirked did not involve
1770 // a SIMD or enregisterable struct.
1771 // if(comp->lvaTable[lclNum].TypeGet() == TYP_STRUCT)
1772 if (varTypeIsStruct(comp->lvaTable[lclNum].TypeGet()))
1773 {
1774 // First arg is addr of a struct local.
1775 paddingNeeded = true;
1776 }
1777 else
1778 {
1779 // Not a struct local.
1780 assert(paddingNeeded == false);
1781 break;
1782 }
1783 }
1784 else
1785 {
1786 // First arg is not a local var addr.
1787 assert(paddingNeeded == false);
1788 break;
1789 }
1790 }
1791 else
1792 {
1793 // Has more than one arg.
1794 paddingNeeded = false;
1795 break;
1796 }
1797 }
1798 }
1799
1800 if (paddingNeeded)
1801 {
1802 comp->compVSQuirkStackPaddingNeeded = VSQUIRK_STACK_PAD;
1803 }
1804 }
1805#endif // _TARGET_AMD64_
1806}
1807
1808 // Inserts a profiler hook, GT_PROF_HOOK, for a tail call node.
1809//
1810// AMD64:
1811// We need to insert this after all nested calls, but before all the arguments to this call have been set up.
1812// To do this, we look for the first GT_PUTARG_STK or GT_PUTARG_REG, and insert the hook immediately before
1813// that. If there are no args, then it should be inserted before the call node.
1814//
1815// For example:
1816// * stmtExpr void (top level) (IL 0x000...0x010)
1817// arg0 SETUP | /--* argPlace ref REG NA $c5
1818// this in rcx | | /--* argPlace ref REG NA $c1
1819// | | | /--* call ref System.Globalization.CultureInfo.get_InvariantCulture $c2
1820// arg1 SETUP | | +--* st.lclVar ref V02 tmp1 REG NA $c2
1821// | | | /--* lclVar ref V02 tmp1 u : 2 (last use) REG NA $c2
1822// arg1 in rdx | | +--* putarg_reg ref REG NA
1823// | | | /--* lclVar ref V00 arg0 u : 2 (last use) REG NA $80
1824// this in rcx | | +--* putarg_reg ref REG NA
1825// | | /--* call nullcheck ref System.String.ToLower $c5
1826// | | { * stmtExpr void (embedded)(IL 0x000... ? ? ? )
1827// | | { \--* prof_hook void REG NA
1828// arg0 in rcx | +--* putarg_reg ref REG NA
1829// control expr | +--* const(h) long 0x7ffe8e910e98 ftn REG NA
1830// \--* call void System.Runtime.Remoting.Identity.RemoveAppNameOrAppGuidIfNecessary $VN.Void
1831//
1832// In this case, the GT_PUTARG_REG src is a nested call. We need to put the instructions after that call
1833// (as shown). We assume that of all the GT_PUTARG_*, only the first one can have a nested call.
1834//
1835// X86:
1836// Insert the profiler hook immediately before the call. The profiler hook will preserve
1837// all argument registers (ECX, EDX), but nothing else.
1838//
1839 // Arguments:
1840 // call - the tail call node
1841// insertionPoint - if non-null, insert the profiler hook before this point.
1842 // If null, insert the profiler hook before args are set up
1843// but after all arg side effects are computed.
1844//
1845void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint)
1846{
1847 assert(call->IsTailCall());
1848 assert(comp->compIsProfilerHookNeeded());
1849
1850#if defined(_TARGET_X86_)
1851
1852 if (insertionPoint == nullptr)
1853 {
1854 insertionPoint = call;
1855 }
1856
1857#else // !defined(_TARGET_X86_)
1858
1859 if (insertionPoint == nullptr)
1860 {
1861 GenTree* tmp = nullptr;
1862 for (GenTreeArgList* args = call->gtCallArgs; args; args = args->Rest())
1863 {
1864 tmp = args->Current();
1865 assert(tmp->OperGet() != GT_PUTARG_REG); // We don't expect to see these in gtCallArgs
1866 if (tmp->OperGet() == GT_PUTARG_STK)
1867 {
1868 // found it
1869 insertionPoint = tmp;
1870 break;
1871 }
1872 }
1873
1874 if (insertionPoint == nullptr)
1875 {
1876 for (GenTreeArgList* args = call->gtCallLateArgs; args; args = args->Rest())
1877 {
1878 tmp = args->Current();
1879 if ((tmp->OperGet() == GT_PUTARG_REG) || (tmp->OperGet() == GT_PUTARG_STK))
1880 {
1881 // found it
1882 insertionPoint = tmp;
1883 break;
1884 }
1885 }
1886
1887 // If there are no args, insert before the call node
1888 if (insertionPoint == nullptr)
1889 {
1890 insertionPoint = call;
1891 }
1892 }
1893 }
1894
1895#endif // !defined(_TARGET_X86_)
1896
1897 assert(insertionPoint != nullptr);
1898 GenTree* profHookNode = new (comp, GT_PROF_HOOK) GenTree(GT_PROF_HOOK, TYP_VOID);
1899 BlockRange().InsertBefore(insertionPoint, profHookNode);
1900}
1901
1902// Lower fast tail call implemented as epilog+jmp.
1903// Also inserts PInvoke method epilog if required.
1904void Lowering::LowerFastTailCall(GenTreeCall* call)
1905{
1906#if FEATURE_FASTTAILCALL
1907 // Tail call restrictions i.e. conditions under which tail prefix is ignored.
1908 // Most of these checks are already done by importer or fgMorphTailCall().
1909 // This serves as a double sanity check.
1910 assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
1911 assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check
1912 assert(!call->IsUnmanaged()); // tail calls to unmanaged methods
1913 assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
1914
1915#ifdef _TARGET_AMD64_
1916 assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check
1917#endif // _TARGET_AMD64_
1918
1919 // We expect to see a call that meets the following conditions
1920 assert(call->IsFastTailCall());
1921
1922 // The VM cannot use return address hijacking when A() and B() tail call each
1923 // other in mutual recursion. Therefore, this block must be reachable through
1924 // a GC-safe point, or the whole method must be marked as fully interruptible.
1925 //
1926 // TODO-Cleanup:
1927 // optReachWithoutCall() depends on the fact that loop header blocks
1928 // will have a block number > fgLastBB. These loop headers get added
1929 // after dominator computation and get skipped by optReachWithoutCall().
1930 // The below condition cannot be asserted in lowering because fgSimpleLowering()
1931 // can add a new basic block for a range check failure, which becomes
1932 // fgLastBB with a block number > the loop header block number.
1933 // assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) ||
1934 // !comp->optReachWithoutCall(comp->fgFirstBB, comp->compCurBB) || comp->genInterruptible);
1935
1936 // If PInvokes are in-lined, we have to remember to execute the PInvoke method epilog anywhere that
1937 // the method returns. This is the case where the caller method has both PInvokes and tail calls.
1938 if (comp->info.compCallUnmanaged)
1939 {
1940 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
1941 }
1942
1943 // Args for a tail call are set up in the incoming arg area. The gc-ness of the args of the
1944 // caller and the callee (which is being tail called) may not match. Therefore, everything
1945 // from arg setup until the epilog needs to be non-interruptible by GC. This is
1946 // achieved by inserting GT_START_NONGC before the very first GT_PUTARG_STK node
1947 // of the call is set up. Note that once a stack arg is set up, there cannot be nested
1948 // calls later in execution order that set up other args, because a nested
1949 // call could over-write the stack arg that was set up earlier.
1950 GenTree* firstPutArgStk = nullptr;
1951 GenTreeArgList* args;
1952 ArrayStack<GenTree*> putargs(comp->getAllocator(CMK_ArrayStack));
1953
1954 for (args = call->gtCallArgs; args; args = args->Rest())
1955 {
1956 GenTree* tmp = args->Current();
1957 if (tmp->OperGet() == GT_PUTARG_STK)
1958 {
1959 putargs.Push(tmp);
1960 }
1961 }
1962
1963 for (args = call->gtCallLateArgs; args; args = args->Rest())
1964 {
1965 GenTree* tmp = args->Current();
1966 if (tmp->OperGet() == GT_PUTARG_STK)
1967 {
1968 putargs.Push(tmp);
1969 }
1970 }
1971
1972 if (!putargs.Empty())
1973 {
1974 firstPutArgStk = putargs.Bottom();
1975 }
1976
1977 // If we have a putarg_stk node, also count the number of non-standard args the
1978 // call node has. Note that while determining whether a tail call can be fast
1979 // tail called, we don't count non-standard args (passed in R10 or R11) since they
1980 // don't contribute to outgoing arg space. These non-standard args are not
1981 // accounted for in the caller's arg count but are accounted for in the callee's arg count after
1982 // fgMorphArgs(). Therefore, exclude the callee's non-standard args while mapping the
1983 // callee's stack arg num to the corresponding caller's stack arg num.
1984 unsigned calleeNonStandardArgCount = call->GetNonStandardAddedArgCount(comp);
1985
1986 // Say Caller(a, b, c, d, e) fast tail calls Callee(e, d, c, b, a),
1987 // i.e. passes its arguments in reverse to Callee. During call site
1988 // setup, after computing argument side effects, stack args are set up
1989 // first and reg args next. In the above example, both Caller's and
1990 // Callee's stack args (e and a respectively) share the same stack slot
1991 // and are alive at the same time. The act of setting up Callee's
1992 // stack arg will over-write the stack arg of Caller, and if there are
1993 // further uses of the Caller stack arg we have to make sure that we move
1994 // it to a temp before over-writing its slot and use the temp in place of
1995 // the corresponding Caller stack arg.
1996 //
1997 // For the above example, conceptually this is what is done
1998 // tmp = e;
1999 // Stack slot of e = a
2000 // R9 = b, R8 = c, RDx = d
2001 // RCX = tmp
2002 //
2003 // The below logic is meant to detect cases like this and introduce
2004 // temps to set up args correctly for Callee.
2005
2006 for (int i = 0; i < putargs.Height(); i++)
2007 {
2008 GenTree* putArgStkNode = putargs.Bottom(i);
2009
2010 assert(putArgStkNode->OperGet() == GT_PUTARG_STK);
2011
2012 // Get the caller arg num corresponding to this callee arg.
2013 // Note that these two args share the same stack slot. Therefore,
2014 // if there are further uses of the corresponding caller arg, we need
2015 // to move it to a temp and use the temp in this call tree.
2016 //
2017 // Note that Caller is guaranteed to have a param corresponding to
2018 // this Callee's arg since the fast tail call mechanism counts the
2019 // stack slots required for both Caller and Callee for passing params
2020 // and allows a fast tail call only if the stack slots required by Caller >=
2021 // those required by Callee.
2022 fgArgTabEntry* argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode);
2023 assert(argTabEntry);
2024 unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount;
2025 noway_assert(callerArgNum < comp->info.compArgsCount);
2026
2027 unsigned callerArgLclNum = callerArgNum;
2028 LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum;
2029 if (callerArgDsc->lvPromoted)
2030 {
2031 callerArgLclNum =
2032 callerArgDsc->lvFieldLclStart; // update the callerArgNum to the promoted struct field's lclNum
2033 callerArgDsc = comp->lvaTable + callerArgLclNum;
2034 }
2035 noway_assert(callerArgDsc->lvIsParam);
2036
2037 // Search the execution-order list until we encounter the call node
2038 unsigned tmpLclNum = BAD_VAR_NUM;
2039 var_types tmpType = TYP_UNDEF;
2040 for (GenTree* treeNode = putArgStkNode->gtNext; treeNode != call; treeNode = treeNode->gtNext)
2041 {
2042 if (treeNode->OperIsLocal() || treeNode->OperIsLocalAddr())
2043 {
2044 // This should not be a GT_PHI_ARG.
2045 assert(treeNode->OperGet() != GT_PHI_ARG);
2046
2047 GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon();
2048 LclVarDsc* lclVar = &comp->lvaTable[lcl->gtLclNum];
2049
2050 // The fast tail calling criteria permit passing structs of size 1, 2, 4 and 8 as args.
2051 // It is possible that callerArgLclNum corresponds to such a struct whose stack slot
2052 // is getting over-written by the setup of a stack arg, and that there are further uses of
2053 // its fields if such a struct is dependently promoted. In this case too
2054 // we need to introduce a temp.
2055 if ((lcl->gtLclNum == callerArgNum) || (lcl->gtLclNum == callerArgLclNum))
2056 {
2057 // Create tmp and use it in place of callerArgDsc
2058 if (tmpLclNum == BAD_VAR_NUM)
2059 {
2060 // Set tmpType first before calling lvaGrabTemp, as that call invalidates callerArgDsc
2061 tmpType = genActualType(callerArgDsc->lvaArgType());
2062 tmpLclNum = comp->lvaGrabTemp(
2063 true DEBUGARG("Fast tail call lowering is creating a new local variable"));
2064
2065 comp->lvaTable[tmpLclNum].lvType = tmpType;
2066 comp->lvaTable[tmpLclNum].lvDoNotEnregister = comp->lvaTable[lcl->gtLclNum].lvDoNotEnregister;
2067 }
2068
2069 lcl->SetLclNum(tmpLclNum);
2070 }
2071 }
2072 }
2073
2074 // If we have created a temp, insert an assignment statement before
2075 // the first putArgStkNode, i.e.
2076 // tmpLcl = CallerArg
2077 if (tmpLclNum != BAD_VAR_NUM)
2078 {
2079 assert(tmpType != TYP_UNDEF);
2080 GenTreeLclVar* local =
2081 new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, tmpType, callerArgLclNum, BAD_IL_OFFSET);
2082 GenTree* assignExpr = comp->gtNewTempAssign(tmpLclNum, local);
2083 ContainCheckRange(local, assignExpr);
2084 BlockRange().InsertBefore(firstPutArgStk, LIR::SeqTree(comp, assignExpr));
2085 }
2086 }
2087
2088 // Insert a GT_START_NONGC node before the first GT_PUTARG_STK node.
2089 // Note that if there are no args to be set up on the stack, there is no need to
2090 // insert a GT_START_NONGC node.
2091 GenTree* startNonGCNode = nullptr;
2092 if (firstPutArgStk != nullptr)
2093 {
2094 startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID);
2095 BlockRange().InsertBefore(firstPutArgStk, startNonGCNode);
2096
2097 // GC interruptibility in the following case:
2098 // foo(a, b, c, d, e) { bar(a, b, c, d, e); }
2099 // bar(a, b, c, d, e) { foo(a, b, d, d, e); }
2100 //
2101 // Since the instruction group starting from the instruction that sets up the first
2102 // stack arg to the end of the tail call is marked as non-GC interruptible,
2103 // this will form a non-interruptible tight loop causing GC starvation. To fix
2104 // this we insert a GT_NO_OP before GT_START_NONGC, if the method
2105 // has a single basic block and is not a GC-safe point. The presence of a single
2106 // nop outside the non-GC-interruptible region will prevent GC starvation.
2107 if ((comp->fgBBcount == 1) && !(comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT))
2108 {
2109 assert(comp->fgFirstBB == comp->compCurBB);
2110 GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID);
2111 BlockRange().InsertBefore(startNonGCNode, noOp);
2112 }
2113 }
2114
2115 // Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be
2116 // inserted before the args are set up but after the side effects of args are
2117 // computed. That is, GT_PROF_HOOK node needs to be inserted before GT_START_NONGC
2118 // node if one exists.
2119 if (comp->compIsProfilerHookNeeded())
2120 {
2121 InsertProfTailCallHook(call, startNonGCNode);
2122 }
2123
2124#else // !FEATURE_FASTTAILCALL
2125
2126 // The platform chose not to implement the fast tail call mechanism.
2127 // In such a case we should never reach this method, as
2128 // the expectation is that IsTailCallViaHelper() will always
2129 // be true on such a platform.
2130 unreached();
2131#endif
2132}
2133
2134//------------------------------------------------------------------------
2135// LowerTailCallViaHelper: lower a call via the tailcall helper. Morph
2136// has already inserted tailcall helper special arguments. This function
2137// inserts actual data for some placeholders.
2138//
2139// For ARM32, AMD64, lower
2140// tail.call(void* copyRoutine, void* dummyArg, ...)
2141// as
2142// Jit_TailCall(void* copyRoutine, void* callTarget, ...)
2143//
2144// For x86, lower
2145// tail.call(<function args>, int numberOfOldStackArgs, int dummyNumberOfNewStackArgs, int flags, void* dummyArg)
2146// as
2147// JIT_TailCall(<function args>, int numberOfOldStackArgsWords, int numberOfNewStackArgsWords, int flags, void*
2148// callTarget)
2149// Note that the special arguments are on the stack, whereas the function arguments follow the normal convention.
2150//
2151// Also inserts PInvoke method epilog if required.
2152//
2153// Arguments:
2154// call - The call node
2155// callTarget - The real call target. This is used to replace the dummyArg during lowering.
2156//
2157// Return Value:
2158// Returns control expression tree for making a call to helper Jit_TailCall.
2159//
2160GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget)
2161{
2162 // Tail call restrictions i.e. conditions under which tail prefix is ignored.
2163 // Most of these checks are already done by importer or fgMorphTailCall().
2164 // This serves as a double sanity check.
2165 assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
2166 assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check
2167 assert(!call->IsUnmanaged()); // tail calls to unmanaged methods
2168 assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
2169
2170#ifdef _TARGET_AMD64_
2171 assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check
2172#endif // _TARGET_AMD64_
2173
2174 // We expect to see a call that meets the following conditions
2175 assert(call->IsTailCallViaHelper());
2176 assert(callTarget != nullptr);
2177
2178 // The TailCall helper call never returns to the caller and is not GC interruptible.
2179 // Therefore the block containing the tail call should be a GC safe point to avoid
2180 // GC starvation. It is legal for the block to be unmarked iff the entry block is a
2181 // GC safe point, as the entry block trivially dominates every reachable block.
2182 assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) || (comp->fgFirstBB->bbFlags & BBF_GC_SAFE_POINT));
2183
2184 // If PInvokes are in-lined, we have to remember to execute the PInvoke method epilog anywhere that
2185 // the method returns. This is the case where the caller method has both PInvokes and tail calls.
2186 if (comp->info.compCallUnmanaged)
2187 {
2188 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
2189 }
2190
2191 // Remove gtCallAddr from execution order if present.
2192 if (call->gtCallType == CT_INDIRECT)
2193 {
2194 assert(call->gtCallAddr != nullptr);
2195
2196 bool isClosed;
2197 LIR::ReadOnlyRange callAddrRange = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed);
2198 assert(isClosed);
2199
2200 BlockRange().Remove(std::move(callAddrRange));
2201 }
2202
2203 // The callTarget tree needs to be sequenced.
2204 LIR::Range callTargetRange = LIR::SeqTree(comp, callTarget);
2205
2206#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM_)
2207
2208 // For ARM32 and AMD64, the first argument is the copy routine and the second argument is a placeholder node.
2209 fgArgTabEntry* argEntry;
2210
2211#ifdef DEBUG
2212 argEntry = comp->gtArgEntryByArgNum(call, 0);
2213 assert(argEntry != nullptr);
2214 assert(argEntry->node->gtOper == GT_PUTARG_REG);
2215 GenTree* firstArg = argEntry->node->gtOp.gtOp1;
2216 assert(firstArg->gtOper == GT_CNS_INT);
2217#endif
2218
2219 // Replace second arg by callTarget.
2220 argEntry = comp->gtArgEntryByArgNum(call, 1);
2221 assert(argEntry != nullptr);
2222 assert(argEntry->node->gtOper == GT_PUTARG_REG);
2223 GenTree* secondArg = argEntry->node->gtOp.gtOp1;
2224
2225 ContainCheckRange(callTargetRange);
2226 BlockRange().InsertAfter(secondArg, std::move(callTargetRange));
2227
2228 bool isClosed;
2229 LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(secondArg, &isClosed);
2230 assert(isClosed);
2231
2232 BlockRange().Remove(std::move(secondArgRange));
2233
2234 argEntry->node->gtOp.gtOp1 = callTarget;
2235
2236#elif defined(_TARGET_X86_)
2237
2238 // Verify the special args are what we expect, and replace the dummy args with real values.
2239 // We need to figure out the size of the outgoing stack arguments, not including the special args.
2240 // The number of 4-byte words is passed to the helper for the incoming and outgoing argument sizes.
2241 // This number is exactly the next slot number in the call's argument info struct.
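// Worked example (hypothetical layout): if the call pushes two user args on the stack,
// each taking one slot, GetNextSlotNum() returns 6 (2 user slots + the 4 special slots),
// so nNewStkArgsWords ends up as 2.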
2242 unsigned nNewStkArgsWords = call->fgArgInfo->GetNextSlotNum();
2243 assert(nNewStkArgsWords >= 4); // There must be at least the four special stack args.
2244 nNewStkArgsWords -= 4;
2245
2246 unsigned numArgs = call->fgArgInfo->ArgCount();
2247
2248 fgArgTabEntry* argEntry;
2249
2250 // arg 0 == callTarget.
2251 argEntry = comp->gtArgEntryByArgNum(call, numArgs - 1);
2252 assert(argEntry != nullptr);
2253 assert(argEntry->node->gtOper == GT_PUTARG_STK);
2254 GenTree* arg0 = argEntry->node->gtOp.gtOp1;
2255
2256 ContainCheckRange(callTargetRange);
2257 BlockRange().InsertAfter(arg0, std::move(callTargetRange));
2258
2259 bool isClosed;
2260 LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(arg0, &isClosed);
2261 assert(isClosed);
2262 BlockRange().Remove(std::move(secondArgRange));
2263
2264 argEntry->node->gtOp.gtOp1 = callTarget;
2265
2266 // arg 1 == flags
2267 argEntry = comp->gtArgEntryByArgNum(call, numArgs - 2);
2268 assert(argEntry != nullptr);
2269 assert(argEntry->node->gtOper == GT_PUTARG_STK);
2270 GenTree* arg1 = argEntry->node->gtOp.gtOp1;
2271 assert(arg1->gtOper == GT_CNS_INT);
2272
2273 ssize_t tailCallHelperFlags = 1 | // always restore EDI,ESI,EBX
2274 (call->IsVirtualStub() ? 0x2 : 0x0); // Stub dispatch flag
2275 arg1->gtIntCon.gtIconVal = tailCallHelperFlags;
2276
2277 // arg 2 == numberOfNewStackArgsWords
2278 argEntry = comp->gtArgEntryByArgNum(call, numArgs - 3);
2279 assert(argEntry != nullptr);
2280 assert(argEntry->node->gtOper == GT_PUTARG_STK);
2281 GenTree* arg2 = argEntry->node->gtOp.gtOp1;
2282 assert(arg2->gtOper == GT_CNS_INT);
2283
2284 arg2->gtIntCon.gtIconVal = nNewStkArgsWords;
2285
2286#ifdef DEBUG
2287 // arg 3 == numberOfOldStackArgsWords
2288 argEntry = comp->gtArgEntryByArgNum(call, numArgs - 4);
2289 assert(argEntry != nullptr);
2290 assert(argEntry->node->gtOper == GT_PUTARG_STK);
2291 GenTree* arg3 = argEntry->node->gtOp.gtOp1;
2292 assert(arg3->gtOper == GT_CNS_INT);
2293#endif // DEBUG
2294
2295#else
2296 NYI("LowerTailCallViaHelper");
2297#endif // _TARGET_*
2298
2299 // Transform this call node into a call to Jit tail call helper.
2300 call->gtCallType = CT_HELPER;
2301 call->gtCallMethHnd = comp->eeFindHelper(CORINFO_HELP_TAILCALL);
2302 call->gtFlags &= ~GTF_CALL_VIRT_KIND_MASK;
2303
2304 // Lower this as if it were a pure helper call.
2305 call->gtCallMoreFlags &= ~(GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER);
2306 GenTree* result = LowerDirectCall(call);
2307
2308 // Now add back tail call flags for identifying this node as tail call dispatched via helper.
2309 call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER;
2310
2311#ifdef PROFILING_SUPPORTED
2312 // Insert profiler tail call hook if needed.
2313 // Since we don't know the insertion point, pass null for second param.
2314 if (comp->compIsProfilerHookNeeded())
2315 {
2316 InsertProfTailCallHook(call, nullptr);
2317 }
2318#endif // PROFILING_SUPPORTED
2319
2320 assert(call->IsTailCallViaHelper());
2321
2322 return result;
2323}
2324
2325#ifndef _TARGET_64BIT_
2326//------------------------------------------------------------------------
2327// Lowering::DecomposeLongCompare: Decomposes a TYP_LONG compare node.
2328//
2329// Arguments:
2330// cmp - the compare node
2331//
2332// Return Value:
2333// The next node to lower.
2334//
2335// Notes:
2336// This is done during lowering because DecomposeLongs handles only nodes
2337// that produce TYP_LONG values. Compare nodes may consume TYP_LONG values
2338// but produce TYP_INT values.
2339//
2340GenTree* Lowering::DecomposeLongCompare(GenTree* cmp)
2341{
2342 assert(cmp->gtGetOp1()->TypeGet() == TYP_LONG);
2343
2344 GenTree* src1 = cmp->gtGetOp1();
2345 GenTree* src2 = cmp->gtGetOp2();
2346 assert(src1->OperIs(GT_LONG));
2347 assert(src2->OperIs(GT_LONG));
2348 GenTree* loSrc1 = src1->gtGetOp1();
2349 GenTree* hiSrc1 = src1->gtGetOp2();
2350 GenTree* loSrc2 = src2->gtGetOp1();
2351 GenTree* hiSrc2 = src2->gtGetOp2();
2352 BlockRange().Remove(src1);
2353 BlockRange().Remove(src2);
2354
2355 genTreeOps condition = cmp->OperGet();
2356 GenTree* loCmp;
2357 GenTree* hiCmp;
2358
2359 if (cmp->OperIs(GT_EQ, GT_NE))
2360 {
2361 //
2362 // Transform (x EQ|NE y) into (((x.lo XOR y.lo) OR (x.hi XOR y.hi)) EQ|NE 0). If y is 0 then this can
2363 // be reduced to just ((x.lo OR x.hi) EQ|NE 0). The OR is expected to set the condition flags so we
2364 // don't need to generate a redundant compare against 0; we only generate a SETCC|JCC instruction.
2365 //
2366 // XOR is used rather than SUB because it is commutative and thus allows swapping the operands when
2367 // the first happens to be a constant. Usually only the second compare operand is a constant but it's
2368 // still possible to have a constant on the left side. For example, when src1 is a uint->ulong cast
2369 // then hiSrc1 would be 0.
2370 //
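// Rough sketch of the result for (x NE 0) where x is TYP_LONG (illustrative, not dump format):
//     t = OR(x.lo, x.hi)       ; marked GTF_SET_FLAGS below, its value left unused
//     SETCC(NE) or JCC(NE)     ; consumes the flags, no explicit compare against 0
//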
2371
2372 if (loSrc1->OperIs(GT_CNS_INT))
2373 {
2374 std::swap(loSrc1, loSrc2);
2375 }
2376
2377 if (loSrc2->IsIntegralConst(0))
2378 {
2379 BlockRange().Remove(loSrc2);
2380 loCmp = loSrc1;
2381 }
2382 else
2383 {
2384 loCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, loSrc1, loSrc2);
2385 BlockRange().InsertBefore(cmp, loCmp);
2386 ContainCheckBinary(loCmp->AsOp());
2387 }
2388
2389 if (hiSrc1->OperIs(GT_CNS_INT))
2390 {
2391 std::swap(hiSrc1, hiSrc2);
2392 }
2393
2394 if (hiSrc2->IsIntegralConst(0))
2395 {
2396 BlockRange().Remove(hiSrc2);
2397 hiCmp = hiSrc1;
2398 }
2399 else
2400 {
2401 hiCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, hiSrc1, hiSrc2);
2402 BlockRange().InsertBefore(cmp, hiCmp);
2403 ContainCheckBinary(hiCmp->AsOp());
2404 }
2405
2406 hiCmp = comp->gtNewOperNode(GT_OR, TYP_INT, loCmp, hiCmp);
2407 BlockRange().InsertBefore(cmp, hiCmp);
2408 ContainCheckBinary(hiCmp->AsOp());
2409 }
2410 else
2411 {
2412 assert(cmp->OperIs(GT_LT, GT_LE, GT_GE, GT_GT));
2413
2414 //
2415 // If the compare is signed then (x LT|GE y) can be transformed into ((x SUB y) LT|GE 0).
2416 // If the compare is unsigned we can still use SUB but we need to check the Carry flag,
2417 // not the actual result. In both cases we can simply check the appropriate condition flags
2418 // and ignore the actual result:
2419 // SUB_LO loSrc1, loSrc2
2420 // SUB_HI hiSrc1, hiSrc2
2421 // SETCC|JCC (signed|unsigned LT|GE)
2422 // If loSrc2 happens to be 0 then the first SUB can be eliminated and the second one can
2423 // be turned into a CMP because the first SUB would have set carry to 0. This effectively
2424 // transforms a long compare against 0 into an int compare of the high part against 0.
2425 //
2426 // (x LE|GT y) can be transformed into ((x SUB y) LE|GT 0) but checking that a long value
2427 // is greater than 0 is not so easy. We need to turn this into a positive/negative check
2428 // like the one we get for LT|GE compares, this can be achieved by swapping the compare:
2429 // (x LE|GT y) becomes (y GE|LT x)
2430 //
2431 // Having to swap operands is problematic when the second operand is a constant. The constant
2432 // moves to the first operand where it cannot be contained and thus needs a register. This can
2433 // be avoided by changing the constant such that LE|GT becomes LT|GE:
2434 // (x LE|GT 41) becomes (x LT|GE 42)
2435 //
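// For a 64 bit constant both halves are adjusted, e.g. (x GT 0x00000000FFFFFFFF) becomes
// (x GE 0x0000000100000000): loSrc2 goes from 0xFFFFFFFF to 0 and hiSrc2 from 0 to 1
// (illustrative values; the max-value check below guards against overflow).
//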
2436
2437 if (cmp->OperIs(GT_LE, GT_GT))
2438 {
2439 bool mustSwap = true;
2440
2441 if (loSrc2->OperIs(GT_CNS_INT) && hiSrc2->OperIs(GT_CNS_INT))
2442 {
2443 uint32_t loValue = static_cast<uint32_t>(loSrc2->AsIntCon()->IconValue());
2444 uint32_t hiValue = static_cast<uint32_t>(hiSrc2->AsIntCon()->IconValue());
2445 uint64_t value = static_cast<uint64_t>(loValue) | (static_cast<uint64_t>(hiValue) << 32);
2446 uint64_t maxValue = cmp->IsUnsigned() ? UINT64_MAX : INT64_MAX;
2447
2448 if (value != maxValue)
2449 {
2450 value++;
2451 loValue = value & UINT32_MAX;
2452 hiValue = (value >> 32) & UINT32_MAX;
2453 loSrc2->AsIntCon()->SetIconValue(loValue);
2454 hiSrc2->AsIntCon()->SetIconValue(hiValue);
2455
2456 condition = cmp->OperIs(GT_LE) ? GT_LT : GT_GE;
2457 mustSwap = false;
2458 }
2459 }
2460
2461 if (mustSwap)
2462 {
2463 std::swap(loSrc1, loSrc2);
2464 std::swap(hiSrc1, hiSrc2);
2465 condition = GenTree::SwapRelop(condition);
2466 }
2467 }
2468
2469 assert((condition == GT_LT) || (condition == GT_GE));
2470
2471 if (loSrc2->IsIntegralConst(0))
2472 {
2473 BlockRange().Remove(loSrc2);
2474
2475 // Very conservative dead code removal... but it helps.
2476
2477 if (loSrc1->OperIs(GT_CNS_INT, GT_LCL_VAR, GT_LCL_FLD))
2478 {
2479 BlockRange().Remove(loSrc1);
2480 }
2481 else
2482 {
2483 loSrc1->SetUnusedValue();
2484 }
2485
2486 hiCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, hiSrc1, hiSrc2);
2487 BlockRange().InsertBefore(cmp, hiCmp);
2488 ContainCheckCompare(hiCmp->AsOp());
2489 }
2490 else
2491 {
2492 loCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, loSrc1, loSrc2);
2493 hiCmp = comp->gtNewOperNode(GT_SUB_HI, TYP_INT, hiSrc1, hiSrc2);
2494 BlockRange().InsertBefore(cmp, loCmp, hiCmp);
2495 ContainCheckCompare(loCmp->AsOp());
2496 ContainCheckBinary(hiCmp->AsOp());
2497
2498 //
2499 // Try to move the SUB_HI's first operand right in front of it; this allows using
2500 // a single temporary register instead of 2 (one for CMP and one for SUB_HI). Do
2501 // this only for locals as they won't change condition flags. Note that we could
2502 // move constants (except 0, which generates XOR reg, reg) but it's extremely rare
2503 // to have a constant as the first operand.
2504 //
2505
2506 if (hiSrc1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
2507 {
2508 BlockRange().Remove(hiSrc1);
2509 BlockRange().InsertBefore(hiCmp, hiSrc1);
2510 }
2511 }
2512 }
2513
2514 hiCmp->gtFlags |= GTF_SET_FLAGS;
2515 if (hiCmp->IsValue())
2516 {
2517 hiCmp->SetUnusedValue();
2518 }
2519
2520 LIR::Use cmpUse;
2521 if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE))
2522 {
2523 BlockRange().Remove(cmp);
2524
2525 GenTree* jcc = cmpUse.User();
2526 jcc->gtOp.gtOp1 = nullptr;
2527 jcc->ChangeOper(GT_JCC);
2528 jcc->gtFlags |= (cmp->gtFlags & GTF_UNSIGNED) | GTF_USE_FLAGS;
2529 jcc->AsCC()->gtCondition = condition;
2530 }
2531 else
2532 {
2533 cmp->gtOp.gtOp1 = nullptr;
2534 cmp->gtOp.gtOp2 = nullptr;
2535 cmp->ChangeOper(GT_SETCC);
2536 cmp->gtFlags |= GTF_USE_FLAGS;
2537 cmp->AsCC()->gtCondition = condition;
2538 }
2539
2540 return cmp->gtNext;
2541}
2542#endif // !_TARGET_64BIT_
2543
2544//------------------------------------------------------------------------
2545// Lowering::OptimizeConstCompare: Performs various "compare with const" optimizations.
2546//
2547// Arguments:
2548// cmp - the compare node
2549//
2550// Return Value:
2551// The original compare node if lowering should proceed as usual or the next node
2552// to lower if the compare node was changed in such a way that lowering is no
2553// longer needed.
2554//
2555// Notes:
2556// - Narrow operands to enable memory operand containment (XARCH specific).
2557// - Transform cmp(and(x, y), 0) into test(x, y) (XARCH/Arm64 specific but could
2558// be used for ARM as well if support for GT_TEST_EQ/GT_TEST_NE is added).
2559// - Transform TEST(x, LSH(1, y)) into BT(x, y) (XARCH specific)
2560// - Transform RELOP(OP, 0) into SETCC(OP) or JCC(OP) if OP can set the
2561// condition flags appropriately (XARCH/ARM64 specific but could be extended
2562// to ARM32 as well if ARM32 codegen supports GTF_SET_FLAGS).
2563//
2564GenTree* Lowering::OptimizeConstCompare(GenTree* cmp)
2565{
2566 assert(cmp->gtGetOp2()->IsIntegralConst());
2567
2568#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
2569 GenTree* op1 = cmp->gtGetOp1();
2570 var_types op1Type = op1->TypeGet();
2571 GenTreeIntCon* op2 = cmp->gtGetOp2()->AsIntCon();
2572 ssize_t op2Value = op2->IconValue();
2573
2574#ifdef _TARGET_XARCH_
2575 if (IsContainableMemoryOp(op1) && varTypeIsSmall(op1Type) && genSmallTypeCanRepresentValue(op1Type, op2Value))
2576 {
2577 //
2578 // If op1's type is small then try to narrow op2 so it has the same type as op1.
2579 // Small types are usually used by memory loads and if both compare operands have
2580 // the same type then the memory load can be contained. In certain situations
2581 // (e.g "cmp ubyte, 200") we also get a smaller instruction encoding.
2582 //
2583
2584 op2->gtType = op1Type;
2585 }
2586 else
2587#endif
2588 if (op1->OperIs(GT_CAST) && !op1->gtOverflow())
2589 {
2590 GenTreeCast* cast = op1->AsCast();
2591 var_types castToType = cast->CastToType();
2592 GenTree* castOp = cast->gtGetOp1();
2593
2594 if (((castToType == TYP_BOOL) || (castToType == TYP_UBYTE)) && FitsIn<UINT8>(op2Value))
2595 {
2596 //
2597 // Since we're going to remove the cast we need to be able to narrow the cast operand
2598 // to the cast type. This can be done safely only for certain opers (e.g. AND, OR, XOR).
2599 // Some opers just can't be narrowed (e.g. DIV, MUL) while others could be narrowed but
2600 // doing so would produce incorrect results (e.g. RSZ, RSH).
2601 //
2602 // The below list of handled opers is conservative but enough to handle the most common
2603 // situations. In particular this includes CALL; sometimes the JIT unnecessarily widens
2604 // the result of bool returning calls.
2605 //
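// Illustrative XARCH example: CMP(CAST<ubyte>(CALL), 1) becomes CMP(CALL, 1) with both
// operands retyped to TYP_UBYTE. On ARM64 this is only done for compares against 0,
// which instead become TEST_EQ/TEST_NE against 0xff (see below).
//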
2606 bool removeCast =
2607#ifdef _TARGET_ARM64_
2608 (op2Value == 0) && cmp->OperIs(GT_EQ, GT_NE, GT_GT) &&
2609#endif
2610 (castOp->OperIs(GT_CALL, GT_LCL_VAR) || castOp->OperIsLogical()
2611#ifdef _TARGET_XARCH_
2612 || IsContainableMemoryOp(castOp)
2613#endif
2614 );
2615
2616 if (removeCast)
2617 {
2618 assert(!castOp->gtOverflowEx()); // Must not be an overflow checking operation
2619
2620#ifdef _TARGET_ARM64_
2621 bool cmpEq = cmp->OperIs(GT_EQ);
2622
2623 cmp->SetOperRaw(cmpEq ? GT_TEST_EQ : GT_TEST_NE);
2624 op2->SetIconValue(0xff);
2625 op2->gtType = castOp->gtType;
2626#else
2627 castOp->gtType = castToType;
2628 op2->gtType = castToType;
2629#endif
2630 // If we have any contained memory ops on castOp, they must now not be contained.
2631 if (castOp->OperIsLogical())
2632 {
2633 GenTree* op1 = castOp->gtGetOp1();
2634 if ((op1 != nullptr) && !op1->IsCnsIntOrI())
2635 {
2636 op1->ClearContained();
2637 }
2638 GenTree* op2 = castOp->gtGetOp2();
2639 if ((op2 != nullptr) && !op2->IsCnsIntOrI())
2640 {
2641 op2->ClearContained();
2642 }
2643 }
2644 cmp->gtOp.gtOp1 = castOp;
2645
2646 BlockRange().Remove(cast);
2647 }
2648 }
2649 }
2650 else if (op1->OperIs(GT_AND) && cmp->OperIs(GT_EQ, GT_NE))
2651 {
2652 //
2653 // Transform ((x AND y) EQ|NE 0) into (x TEST_EQ|TEST_NE y) when possible.
2654 //
2655
2656 GenTree* andOp1 = op1->gtGetOp1();
2657 GenTree* andOp2 = op1->gtGetOp2();
2658
2659 if (op2Value != 0)
2660 {
2661 //
2662 // If we don't have a 0 compare we can get one by transforming ((x AND mask) EQ|NE mask)
2663 // into ((x AND mask) NE|EQ 0) when mask is a single bit.
2664 //
2665
2666 if (isPow2(static_cast<size_t>(op2Value)) && andOp2->IsIntegralConst(op2Value))
2667 {
2668 op2Value = 0;
2669 op2->SetIconValue(0);
2670 cmp->SetOperRaw(GenTree::ReverseRelop(cmp->OperGet()));
2671 }
2672 }
2673
2674 if (op2Value == 0)
2675 {
2676 BlockRange().Remove(op1);
2677 BlockRange().Remove(op2);
2678
2679 cmp->SetOperRaw(cmp->OperIs(GT_EQ) ? GT_TEST_EQ : GT_TEST_NE);
2680 cmp->gtOp.gtOp1 = andOp1;
2681 cmp->gtOp.gtOp2 = andOp2;
2682 // We will re-evaluate containment below
2683 andOp1->ClearContained();
2684 andOp2->ClearContained();
2685
2686#ifdef _TARGET_XARCH_
2687 if (IsContainableMemoryOp(andOp1) && andOp2->IsIntegralConst())
2688 {
2689 //
2690 // For "test" we only care about the bits that are set in the second operand (mask).
2691 // If the mask fits in a small type then we can narrow both operands to generate a "test"
2692 // instruction with a smaller encoding ("test" does not have a r/m32, imm8 form) and avoid
2693 // a widening load in some cases.
2694 //
2695 // For 16 bit operands we narrow only if the memory operand is already 16 bit. This matches
2696 // the behavior of a previous implementation and avoids adding more cases where we generate
2697 // 16 bit instructions that require a length changing prefix (0x66). These suffer from
2698 // significant decoder stalls on Intel CPUs.
2699 //
2700 // We could also do this for 64 bit masks that fit into 32 bit but it doesn't help.
2701 // In such cases morph narrows down the existing GT_AND by inserting a cast between it and
2702 // the memory operand so we'd need to add more code to recognize and eliminate that cast.
2703 //
2704
2705 size_t mask = static_cast<size_t>(andOp2->AsIntCon()->IconValue());
2706
2707 if (FitsIn<UINT8>(mask))
2708 {
2709 andOp1->gtType = TYP_UBYTE;
2710 andOp2->gtType = TYP_UBYTE;
2711 }
2712 else if (FitsIn<UINT16>(mask) && genTypeSize(andOp1) == 2)
2713 {
2714 andOp1->gtType = TYP_USHORT;
2715 andOp2->gtType = TYP_USHORT;
2716 }
2717 }
2718#endif
2719 }
2720 }
2721
2722 if (cmp->OperIs(GT_TEST_EQ, GT_TEST_NE))
2723 {
2724#ifdef _TARGET_XARCH_
2725 //
2726 // Transform TEST_EQ|NE(x, LSH(1, y)) into BT(x, y) when possible. Using BT
2727 // results in smaller and faster code. It also doesn't have special register
2728 // requirements, unlike LSH that requires the shift count to be in ECX.
2729 // Note that BT has the same behavior as LSH when the bit index exceeds the
2730 // operand bit size - it uses (bit_index MOD bit_size).
2731 //
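// Illustrative shape: TEST_NE(x, LSH(1, y)) used by a JTRUE becomes
//     BT x, y            ; sets the carry flag to the selected bit
//     JCC (unsigned LT)  ; i.e. jump if carry is set
// and the same pattern without a JTRUE user produces a SETCC instead.
//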
2732
2733 GenTree* lsh = cmp->gtGetOp2();
2734 LIR::Use cmpUse;
2735
2736 if (lsh->OperIs(GT_LSH) && varTypeIsIntOrI(lsh->TypeGet()) && lsh->gtGetOp1()->IsIntegralConst(1) &&
2737 BlockRange().TryGetUse(cmp, &cmpUse))
2738 {
2739 genTreeOps condition = cmp->OperIs(GT_TEST_NE) ? GT_LT : GT_GE;
2740
2741 cmp->SetOper(GT_BT);
2742 cmp->gtType = TYP_VOID;
2743 cmp->gtFlags |= GTF_SET_FLAGS;
2744 cmp->gtOp.gtOp2 = lsh->gtGetOp2();
2745 cmp->gtGetOp2()->ClearContained();
2746
2747 BlockRange().Remove(lsh->gtGetOp1());
2748 BlockRange().Remove(lsh);
2749
2750 GenTreeCC* cc;
2751
2752 if (cmpUse.User()->OperIs(GT_JTRUE))
2753 {
2754 cmpUse.User()->ChangeOper(GT_JCC);
2755 cc = cmpUse.User()->AsCC();
2756 cc->gtCondition = condition;
2757 }
2758 else
2759 {
2760 cc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, TYP_INT);
2761 BlockRange().InsertAfter(cmp, cc);
2762 cmpUse.ReplaceWith(comp, cc);
2763 }
2764
2765 cc->gtFlags |= GTF_USE_FLAGS | GTF_UNSIGNED;
2766
2767 return cmp->gtNext;
2768 }
2769#endif // _TARGET_XARCH_
2770 }
2771 else if (cmp->OperIs(GT_EQ, GT_NE))
2772 {
2773 GenTree* op1 = cmp->gtGetOp1();
2774 GenTree* op2 = cmp->gtGetOp2();
2775
2776 // TODO-CQ: right now the below peep is inexpensive and gets the benefit in most
2777 // cases because in the majority of cases op1, op2 and cmp would be in that order in
2778 // execution. In general we should be able to check that all the nodes that come
2779 // after op1 do not modify the flags so that it is safe to avoid generating a
2780 // test instruction.
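// Illustrative result: for JTRUE(NE(AND(x, y), 0)) with AND, 0 and NE adjacent in
// execution order, the AND is marked GTF_SET_FLAGS (its value left unused), the
// constant and the relop are removed, and the JTRUE becomes JCC(NE).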
2781
2782 if (op2->IsIntegralConst(0) && (op1->gtNext == op2) && (op2->gtNext == cmp) &&
2783#ifdef _TARGET_XARCH_
2784 op1->OperIs(GT_AND, GT_OR, GT_XOR, GT_ADD, GT_SUB, GT_NEG))
2785#else // _TARGET_ARM64_
2786 op1->OperIs(GT_AND, GT_ADD, GT_SUB))
2787#endif
2788 {
2789 op1->gtFlags |= GTF_SET_FLAGS;
2790 op1->SetUnusedValue();
2791
2792 BlockRange().Remove(op2);
2793
2794 GenTree* next = cmp->gtNext;
2795 GenTree* cc;
2796 genTreeOps ccOp;
2797 LIR::Use cmpUse;
2798
2799 // Fast check for the common case - relop used by a JTRUE that immediately follows it.
2800 if ((next != nullptr) && next->OperIs(GT_JTRUE) && (next->gtGetOp1() == cmp))
2801 {
2802 cc = next;
2803 ccOp = GT_JCC;
2804 next = nullptr;
2805 BlockRange().Remove(cmp);
2806 }
2807 else if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE))
2808 {
2809 cc = cmpUse.User();
2810 ccOp = GT_JCC;
2811 next = nullptr;
2812 BlockRange().Remove(cmp);
2813 }
2814 else // The relop is not used by a JTRUE or it is not used at all.
2815 {
2816 // Transform the relop node into a SETCC. If it's not used we could remove
2817 // it completely but that means doing more work to handle a rare case.
2818 cc = cmp;
2819 ccOp = GT_SETCC;
2820 }
2821
2822 genTreeOps condition = cmp->OperGet();
2823 cc->ChangeOper(ccOp);
2824 cc->AsCC()->gtCondition = condition;
2825 cc->gtFlags |= GTF_USE_FLAGS | (cmp->gtFlags & GTF_UNSIGNED);
2826
2827 return next;
2828 }
2829 }
2830#endif // defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
2831
2832 return cmp;
2833}
2834
2835//------------------------------------------------------------------------
2836// Lowering::LowerCompare: Lowers a compare node.
2837//
2838// Arguments:
2839// cmp - the compare node
2840//
2841// Return Value:
2842// The next node to lower.
2843//
2844GenTree* Lowering::LowerCompare(GenTree* cmp)
2845{
2846#ifndef _TARGET_64BIT_
2847 if (cmp->gtGetOp1()->TypeGet() == TYP_LONG)
2848 {
2849 return DecomposeLongCompare(cmp);
2850 }
2851#endif
2852
2853 if (cmp->gtGetOp2()->IsIntegralConst() && !comp->opts.MinOpts())
2854 {
2855 GenTree* next = OptimizeConstCompare(cmp);
2856
2857 // If OptimizeConstCompare returns the compare node as "next" then we need to continue lowering.
2858 if (next != cmp)
2859 {
2860 return next;
2861 }
2862 }
2863
2864#ifdef _TARGET_XARCH_
2865 if (cmp->gtGetOp1()->TypeGet() == cmp->gtGetOp2()->TypeGet())
2866 {
2867 if (varTypeIsSmall(cmp->gtGetOp1()->TypeGet()) && varTypeIsUnsigned(cmp->gtGetOp1()->TypeGet()))
2868 {
2869 //
2870 // If both operands have the same type then codegen will use the common operand type to
2871 // determine the instruction type. For small types this would result in performing a
2872 // signed comparison of two small unsigned values without zero extending them to TYP_INT
2873 // which is incorrect. Note that making the comparison unsigned doesn't imply that codegen
2874 // has to generate a small comparison, it can still correctly generate a TYP_INT comparison.
2875 //
2876
2877 cmp->gtFlags |= GTF_UNSIGNED;
2878 }
2879 }
2880#endif // _TARGET_XARCH_
2881 ContainCheckCompare(cmp->AsOp());
2882 return cmp->gtNext;
2883}
2884
2885//------------------------------------------------------------------------
2886// Lowering::LowerJTrue: Lowers a JTRUE node.
2887//
2888// Arguments:
2889// jtrue - the JTRUE node
2890//
2891// Return Value:
2892// The next node to lower (usually nullptr).
2893//
2894// Notes:
2895// On ARM64 this may remove the JTRUE node and transform its associated
2896// relop into a JCMP node.
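// For example (illustrative), JTRUE(EQ(x, 0)) becomes JCMP(x, 0) with GTF_JCMP_EQ,
// which codegen emits as a cbz, while JTRUE(TEST_NE(x, 8)) becomes a JCMP with
// GTF_JCMP_TST, emitted as a tbnz on bit 3.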
2897//
2898GenTree* Lowering::LowerJTrue(GenTreeOp* jtrue)
2899{
2900#ifdef _TARGET_ARM64_
2901 GenTree* relop = jtrue->gtGetOp1();
2902 GenTree* relopOp2 = relop->gtOp.gtGetOp2();
2903
2904 if ((relop->gtNext == jtrue) && relopOp2->IsCnsIntOrI())
2905 {
2906 bool useJCMP = false;
2907 unsigned flags = 0;
2908
2909 if (relop->OperIs(GT_EQ, GT_NE) && relopOp2->IsIntegralConst(0))
2910 {
2911 // Codegen will use cbz or cbnz, which do not affect the flag register
2912 flags = relop->OperIs(GT_EQ) ? GTF_JCMP_EQ : 0;
2913 useJCMP = true;
2914 }
2915 else if (relop->OperIs(GT_TEST_EQ, GT_TEST_NE) && isPow2(relopOp2->AsIntCon()->IconValue()))
2916 {
2917 // Codegen will use tbz or tbnz, which do not affect the flag register
2918 flags = GTF_JCMP_TST | (relop->OperIs(GT_TEST_EQ) ? GTF_JCMP_EQ : 0);
2919 useJCMP = true;
2920 }
2921
2922 if (useJCMP)
2923 {
2924 relop->SetOper(GT_JCMP);
2925 relop->gtFlags &= ~(GTF_JCMP_TST | GTF_JCMP_EQ);
2926 relop->gtFlags |= flags;
2927 relop->gtType = TYP_VOID;
2928
2929 relopOp2->SetContained();
2930
2931 BlockRange().Remove(jtrue);
2932
2933 assert(relop->gtNext == nullptr);
2934 return nullptr;
2935 }
2936 }
2937#endif // _TARGET_ARM64_
2938
2939 ContainCheckJTrue(jtrue);
2940
2941 assert(jtrue->gtNext == nullptr);
2942 return nullptr;
2943}
2944
2945// Lower "jmp <method>" tail call to insert PInvoke method epilog if required.
2946void Lowering::LowerJmpMethod(GenTree* jmp)
2947{
2948 assert(jmp->OperGet() == GT_JMP);
2949
2950 JITDUMP("lowering GT_JMP\n");
2951 DISPNODE(jmp);
2952 JITDUMP("============");
2953
2954 // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that
2955 // a method returns.
2956 if (comp->info.compCallUnmanaged)
2957 {
2958 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(jmp));
2959 }
2960}
2961
2962// Lower GT_RETURN node to insert PInvoke method epilog if required.
2963void Lowering::LowerRet(GenTree* ret)
2964{
2965 assert(ret->OperGet() == GT_RETURN);
2966
2967 JITDUMP("lowering GT_RETURN\n");
2968 DISPNODE(ret);
2969 JITDUMP("============");
2970
2971#if defined(_TARGET_AMD64_) && defined(FEATURE_SIMD)
2972 GenTreeUnOp* const unOp = ret->AsUnOp();
2973 if ((unOp->TypeGet() == TYP_LONG) && (unOp->gtOp1->TypeGet() == TYP_SIMD8))
2974 {
2975 GenTreeUnOp* bitcast = new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, TYP_LONG, unOp->gtOp1, nullptr);
2976 unOp->gtOp1 = bitcast;
2977 BlockRange().InsertBefore(unOp, bitcast);
2978 }
2979#endif // _TARGET_AMD64_
2980
2981 // A method doing PInvokes has exactly one return block unless it has tail calls.
2982 if (comp->info.compCallUnmanaged && (comp->compCurBB == comp->genReturnBB))
2983 {
2984 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(ret));
2985 }
2986 ContainCheckRet(ret->AsOp());
2987}
2988
2989GenTree* Lowering::LowerDirectCall(GenTreeCall* call)
2990{
2991 noway_assert(call->gtCallType == CT_USER_FUNC || call->gtCallType == CT_HELPER);
2992
2993 // We don't support tail calling helper methods.
2994 // But tail calls dispatched via the JIT tail call helper appear as tail calls to a helper method.
2995 noway_assert(!call->IsTailCall() || call->IsTailCallViaHelper() || call->gtCallType == CT_USER_FUNC);
2996
2997 // Non-virtual direct/indirect calls: Work out if the address of the
2998 // call is known at JIT time. If not, it is either an indirect call
2999 // or the address must be accessed via a single/double indirection.
3000
3001 void* addr;
3002 InfoAccessType accessType;
3003 CorInfoHelpFunc helperNum = comp->eeGetHelperNum(call->gtCallMethHnd);
3004
3005#ifdef FEATURE_READYTORUN_COMPILER
3006 if (call->gtEntryPoint.addr != nullptr)
3007 {
3008 accessType = call->gtEntryPoint.accessType;
3009 addr = call->gtEntryPoint.addr;
3010 }
3011 else
3012#endif
3013 if (call->gtCallType == CT_HELPER)
3014 {
3015 noway_assert(helperNum != CORINFO_HELP_UNDEF);
3016
3017 // the convention of getHelperFtn seems to be (it's not documented)
3018 // that it either returns an address or, if it returns null, sets pAddr to
3019 // another address, which requires an indirection
3020 void* pAddr;
3021 addr = comp->info.compCompHnd->getHelperFtn(helperNum, (void**)&pAddr);
3022
3023 if (addr != nullptr)
3024 {
3025 assert(pAddr == nullptr);
3026 accessType = IAT_VALUE;
3027 }
3028 else
3029 {
3030 accessType = IAT_PVALUE;
3031 addr = pAddr;
3032 }
3033 }
3034 else
3035 {
3036 noway_assert(helperNum == CORINFO_HELP_UNDEF);
3037
3038 CORINFO_ACCESS_FLAGS aflags = CORINFO_ACCESS_ANY;
3039
3040 if (call->IsSameThis())
3041 {
3042 aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_THIS);
3043 }
3044
3045 if (!call->NeedsNullCheck())
3046 {
3047 aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_NONNULL);
3048 }
3049
3050 CORINFO_CONST_LOOKUP addrInfo;
3051 comp->info.compCompHnd->getFunctionEntryPoint(call->gtCallMethHnd, &addrInfo, aflags);
3052
3053 accessType = addrInfo.accessType;
3054 addr = addrInfo.addr;
3055 }
3056
3057 GenTree* result = nullptr;
3058 switch (accessType)
3059 {
3060 case IAT_VALUE:
3061 // Non-virtual direct call to known address
3062 if (!IsCallTargetInRange(addr) || call->IsTailCall())
3063 {
3064 result = AddrGen(addr);
3065 }
3066 else
3067 {
3068 // a direct call within range of hardware relative call instruction
3069 // stash the address for codegen
3070 call->gtDirectCallAddress = addr;
3071 }
3072 break;
3073
3074 case IAT_PVALUE:
3075 {
3076 // Non-virtual direct calls to addresses accessed by
3077 // a single indirection.
3078 GenTree* cellAddr = AddrGen(addr);
3079 GenTree* indir = Ind(cellAddr);
3080 result = indir;
3081 break;
3082 }
3083
3084 case IAT_PPVALUE:
3085 // Non-virtual direct calls to addresses accessed by
3086 // a double indirection.
3087 //
3088 // Double-indirection. Load the address into a register
3089 // and call indirectly through the register
3090 noway_assert(helperNum == CORINFO_HELP_UNDEF);
3091 result = AddrGen(addr);
3092 result = Ind(Ind(result));
3093 break;
3094
3095 case IAT_RELPVALUE:
3096 {
3097 // Non-virtual direct calls to addresses accessed by
3098 // a single relative indirection.
3099 GenTree* cellAddr = AddrGen(addr);
3100 GenTree* indir = Ind(cellAddr);
3101 result = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, indir, AddrGen(addr));
3102 break;
3103 }
3104
3105 default:
3106 noway_assert(!"Bad accessType");
3107 break;
3108 }
3109
3110 return result;
3111}
3112
3113GenTree* Lowering::LowerDelegateInvoke(GenTreeCall* call)
3114{
3115 noway_assert(call->gtCallType == CT_USER_FUNC);
3116
3117 assert((comp->info.compCompHnd->getMethodAttribs(call->gtCallMethHnd) &
3118 (CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL)) == (CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL));
3119
3120 GenTree* thisArgNode;
3121 if (call->IsTailCallViaHelper())
3122 {
3123#ifdef _TARGET_X86_ // x86 tailcall via helper follows normal calling convention, but with extra stack args.
3124 const unsigned argNum = 0;
3125#else // !_TARGET_X86_
3126 // In the case of helper-dispatched tail calls, "thisptr" will be the third arg.
3127 // The first two args are the real call target and the address of the args copy routine.
3128 const unsigned argNum = 2;
3129#endif // !_TARGET_X86_
3130
3131 fgArgTabEntry* thisArgTabEntry = comp->gtArgEntryByArgNum(call, argNum);
3132 thisArgNode = thisArgTabEntry->node;
3133 }
3134 else
3135 {
3136 thisArgNode = comp->gtGetThisArg(call);
3137 }
3138
3139 assert(thisArgNode->gtOper == GT_PUTARG_REG);
3140 GenTree* originalThisExpr = thisArgNode->gtOp.gtOp1;
3141 GenTree* thisExpr = originalThisExpr;
3142
3143 // We're going to use the 'this' expression multiple times, so make a local to copy it.
3144
3145 unsigned lclNum;
3146
3147#ifdef _TARGET_X86_
3148 if (call->IsTailCallViaHelper() && originalThisExpr->IsLocal())
3149 {
3150 // For ordering purposes for the special tailcall arguments on x86, we forced the
3151 // 'this' pointer in this case to a local in Compiler::fgMorphTailCall().
3152 // We could possibly use this case to remove copies for all architectures and non-tailcall
3153 // calls by creating a new lcl var or lcl field reference, as is done in the
3154 // LowerVirtualVtableCall() code.
3155 assert(originalThisExpr->OperGet() == GT_LCL_VAR);
3156 lclNum = originalThisExpr->AsLclVarCommon()->GetLclNum();
3157 }
3158 else
3159#endif // _TARGET_X86_
3160 {
3161 unsigned delegateInvokeTmp = comp->lvaGrabTemp(true DEBUGARG("delegate invoke call"));
3162
3163 LIR::Use thisExprUse(BlockRange(), &thisArgNode->gtOp.gtOp1, thisArgNode);
3164 ReplaceWithLclVar(thisExprUse, delegateInvokeTmp);
3165
3166 thisExpr = thisExprUse.Def(); // it's changed; reload it.
3167 lclNum = delegateInvokeTmp;
3168 }
3169
3170 // replace original expression feeding into thisPtr with
3171 // [originalThis + offsetOfDelegateInstance]
3172
3173 GenTree* newThisAddr = new (comp, GT_LEA)
3174 GenTreeAddrMode(TYP_BYREF, thisExpr, nullptr, 0, comp->eeGetEEInfo()->offsetOfDelegateInstance);
3175
3176 GenTree* newThis = comp->gtNewOperNode(GT_IND, TYP_REF, newThisAddr);
3177
3178 BlockRange().InsertAfter(thisExpr, newThisAddr, newThis);
3179
3180 thisArgNode->gtOp.gtOp1 = newThis;
3181 ContainCheckIndir(newThis->AsIndir());
3182
3183 // the control target is
3184 // [originalThis + firstTgtOffs]
3185
3186 GenTree* base = new (comp, GT_LCL_VAR) GenTreeLclVar(originalThisExpr->TypeGet(), lclNum, BAD_IL_OFFSET);
3187
3188 unsigned targetOffs = comp->eeGetEEInfo()->offsetOfDelegateFirstTarget;
3189 GenTree* result = new (comp, GT_LEA) GenTreeAddrMode(TYP_REF, base, nullptr, 0, targetOffs);
3190 GenTree* callTarget = Ind(result);
3191
3192 // don't need to sequence and insert this tree, caller will do it
3193
3194 return callTarget;
3195}
3196
3197GenTree* Lowering::LowerIndirectNonvirtCall(GenTreeCall* call)
3198{
3199#ifdef _TARGET_X86_
3200 if (call->gtCallCookie != nullptr)
3201 {
3202 NYI_X86("Morphing indirect non-virtual call with non-standard args");
3203 }
3204#endif
3205
    // Indirect cookie calls get transformed by fgMorphArgs into indirect calls with non-standard args.
    // Hence we should never see this type of call in lower.
3208
3209 noway_assert(call->gtCallCookie == nullptr);
3210
3211 return nullptr;
3212}
3213
3214//------------------------------------------------------------------------
3215// CreateReturnTrapSeq: Create a tree to perform a "return trap", used in PInvoke
3216// epilogs to invoke a GC under a condition. The return trap checks some global
3217// location (the runtime tells us where that is and how many indirections to make),
3218// then, based on the result, conditionally calls a GC helper. We use a special node
3219// for this because at this time (late in the compilation phases), introducing flow
3220// is tedious/difficult.
3221//
3222// This is used for PInvoke inlining.
3223//
3224// Return Value:
3225// Code tree to perform the action.
3226//
3227GenTree* Lowering::CreateReturnTrapSeq()
3228{
3229 // The GT_RETURNTRAP node expands to this:
3230 // if (g_TrapReturningThreads)
3231 // {
3232 // RareDisablePreemptiveGC();
3233 // }
3234
3235 // The only thing to do here is build up the expression that evaluates 'g_TrapReturningThreads'.
3236
3237 void* pAddrOfCaptureThreadGlobal = nullptr;
3238 LONG* addrOfCaptureThreadGlobal = comp->info.compCompHnd->getAddrOfCaptureThreadGlobal(&pAddrOfCaptureThreadGlobal);
3239
3240 GenTree* testTree;
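    // If the runtime handed us the flag's address directly, a single indirection reads it;
    // otherwise we were given the address of a cell holding that address, so two indirections
    // are needed.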
3241 if (addrOfCaptureThreadGlobal != nullptr)
3242 {
3243 testTree = Ind(AddrGen(addrOfCaptureThreadGlobal));
3244 }
3245 else
3246 {
3247 testTree = Ind(Ind(AddrGen(pAddrOfCaptureThreadGlobal)));
3248 }
3249 return comp->gtNewOperNode(GT_RETURNTRAP, TYP_INT, testTree);
3250}
3251
3252//------------------------------------------------------------------------
3253// SetGCState: Create a tree that stores the given constant (0 or 1) into the
3254// thread's GC state field.
3255//
3256// This is used for PInvoke inlining.
3257//
3258// Arguments:
3259// state - constant (0 or 1) to store into the thread's GC state field.
3260//
3261// Return Value:
3262// Code tree to perform the action.
3263//
3264GenTree* Lowering::SetGCState(int state)
3265{
3266 // Thread.offsetOfGcState = 0/1
3267
3268 assert(state == 0 || state == 1);
3269
3270 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
3271
3272 GenTree* base = new (comp, GT_LCL_VAR) GenTreeLclVar(TYP_I_IMPL, comp->info.compLvFrameListRoot, -1);
3273
3274 GenTree* stateNode = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_BYTE, state);
3275 GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, base, nullptr, 1, pInfo->offsetOfGCState);
3276 GenTree* storeGcState = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_BYTE, addr, stateNode);
3277 return storeGcState;
3278}
3279
3280//------------------------------------------------------------------------
3281// CreateFrameLinkUpdate: Create a tree that either links or unlinks the
3282// locally-allocated InlinedCallFrame from the Frame list.
3283//
3284// This is used for PInvoke inlining.
3285//
3286// Arguments:
3287// action - whether to link (push) or unlink (pop) the Frame
3288//
3289// Return Value:
3290// Code tree to perform the action.
3291//
3292GenTree* Lowering::CreateFrameLinkUpdate(FrameLinkAction action)
3293{
3294 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
3295 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
3296
3297 GenTree* TCB = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot,
3298 (IL_OFFSET)-1); // cast to resolve ambiguity.
3299
3300 // Thread->m_pFrame
3301 GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, TCB, nullptr, 1, pInfo->offsetOfThreadFrame);
3302
3303 GenTree* data = nullptr;
3304
3305 if (action == PushFrame)
3306 {
3307 // Thread->m_pFrame = &inlinedCallFrame;
3308 data = new (comp, GT_LCL_FLD_ADDR)
3309 GenTreeLclFld(GT_LCL_FLD_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
3310 }
3311 else
3312 {
3313 assert(action == PopFrame);
3314 // Thread->m_pFrame = inlinedCallFrame.m_pNext;
3315
3316 data = new (comp, GT_LCL_FLD) GenTreeLclFld(GT_LCL_FLD, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar,
3317 pInfo->inlinedCallFrameInfo.offsetOfFrameLink);
3318 }
3319 GenTree* storeInd = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_I_IMPL, addr, data);
3320 return storeInd;
3321}
3322
3323//------------------------------------------------------------------------
3324// InsertPInvokeMethodProlog: Create the code that runs at the start of
3325// every method that has PInvoke calls.
3326//
3327// Initialize the TCB local and the InlinedCallFrame object. Then link ("push")
3328// the InlinedCallFrame object on the Frame chain. The layout of InlinedCallFrame
3329// is defined in vm/frames.h. See also vm/jitinterface.cpp for more information.
// The offsets of these fields are returned by the VM in a call to ICorStaticInfo::getEEInfo().
3331//
3332// The (current) layout is as follows:
3333//
3334// 64-bit 32-bit CORINFO_EE_INFO
3335// offset offset field name offset when set
3336// -----------------------------------------------------------------------------------------
3337// +00h +00h GS cookie offsetOfGSCookie
3338// +08h +04h vptr for class InlinedCallFrame offsetOfFrameVptr method prolog
3339// +10h +08h m_Next offsetOfFrameLink method prolog
3340// +18h +0Ch m_Datum offsetOfCallTarget call site
3341// +20h n/a m_StubSecretArg not set by JIT
3342// +28h +10h m_pCallSiteSP offsetOfCallSiteSP x86: call site, and zeroed in method
3343// prolog;
3344// non-x86: method prolog (SP remains
3345// constant in function, after prolog: no
3346// localloc and PInvoke in same function)
3347// +30h +14h m_pCallerReturnAddress offsetOfReturnAddress call site
3348// +38h +18h m_pCalleeSavedFP offsetOfCalleeSavedFP not set by JIT
3349// +1Ch JIT retval spill area (int) before call_gc ???
3350// +20h JIT retval spill area (long) before call_gc ???
3351// +24h Saved value of EBP method prolog ???
3352//
3353// Note that in the VM, InlinedCallFrame is a C++ class whose objects have a 'this' pointer that points
3354// to the InlinedCallFrame vptr (the 2nd field listed above), and the GS cookie is stored *before*
3355// the object. When we link the InlinedCallFrame onto the Frame chain, we must point at this location,
3356// and not at the beginning of the InlinedCallFrame local, which is actually the GS cookie.
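//
// As a rough sketch, the prolog sequence built below is:
//     compLvFrameListRoot (TCB) = CORINFO_HELP_INIT_PINVOKE_FRAME(&inlinedCallFrame.vptr [, secret stub param])
//     inlinedCallFrame.m_pCallSiteSP    = SP     // except x86/arm32
//     inlinedCallFrame.m_pCalleeSavedFP = FP     // except arm32
//     Thread->m_pFrame = &inlinedCallFrame.vptr  // 64-bit IL stubs only; elsewhere this is done per call site
//                                                // or by the init helper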
3357//
3358// Return Value:
3359// none
3360//
3361void Lowering::InsertPInvokeMethodProlog()
3362{
3363 noway_assert(comp->info.compCallUnmanaged);
3364 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
3365
3366 if (comp->opts.ShouldUsePInvokeHelpers())
3367 {
3368 return;
3369 }
3370
3371 JITDUMP("======= Inserting PInvoke method prolog\n");
3372
3373 // The first BB must be a scratch BB in order for us to be able to safely insert the P/Invoke prolog.
3374 assert(comp->fgFirstBBisScratch());
3375
3376 LIR::Range& firstBlockRange = LIR::AsRange(comp->fgFirstBB);
3377
3378 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
3379 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
3380
3381 // First arg: &compiler->lvaInlinedPInvokeFrameVar + callFrameInfo.offsetOfFrameVptr
3382
3383 GenTree* frameAddr = new (comp, GT_LCL_FLD_ADDR)
3384 GenTreeLclFld(GT_LCL_FLD_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
3385
3386 // Call runtime helper to fill in our InlinedCallFrame and push it on the Frame list:
3387 // TCB = CORINFO_HELP_INIT_PINVOKE_FRAME(&symFrameStart, secretArg);
    // For x86 and arm32, don't pass the secretArg.
3389 CLANG_FORMAT_COMMENT_ANCHOR;
3390
3391#if defined(_TARGET_X86_) || defined(_TARGET_ARM_)
3392 GenTreeArgList* argList = comp->gtNewArgList(frameAddr);
3393#else
3394 GenTreeArgList* argList = comp->gtNewArgList(frameAddr, PhysReg(REG_SECRET_STUB_PARAM));
3395#endif
3396
3397 GenTree* call = comp->gtNewHelperCallNode(CORINFO_HELP_INIT_PINVOKE_FRAME, TYP_I_IMPL, argList);
3398
3399 // some sanity checks on the frame list root vardsc
3400 LclVarDsc* varDsc = &comp->lvaTable[comp->info.compLvFrameListRoot];
3401 noway_assert(!varDsc->lvIsParam);
3402 noway_assert(varDsc->lvType == TYP_I_IMPL);
3403
3404 GenTree* store =
3405 new (comp, GT_STORE_LCL_VAR) GenTreeLclVar(GT_STORE_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot,
3406 (IL_OFFSET)-1); // cast to resolve ambiguity.
3407 store->gtOp.gtOp1 = call;
3408 store->gtFlags |= GTF_VAR_DEF;
3409
3410 GenTree* const insertionPoint = firstBlockRange.FirstNonPhiOrCatchArgNode();
3411
3412 comp->fgMorphTree(store);
3413 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, store));
3414 DISPTREERANGE(firstBlockRange, store);
3415
3416#if !defined(_TARGET_X86_) && !defined(_TARGET_ARM_)
3417 // For x86, this step is done at the call site (due to stack pointer not being static in the function).
3418 // For arm32, CallSiteSP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
3419
3420 // --------------------------------------------------------
3421 // InlinedCallFrame.m_pCallSiteSP = @RSP;
3422
3423 GenTreeLclFld* storeSP = new (comp, GT_STORE_LCL_FLD)
3424 GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfCallSiteSP);
3425 storeSP->gtOp1 = PhysReg(REG_SPBASE);
3426 storeSP->gtFlags |= GTF_VAR_DEF;
3427
3428 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeSP));
3429 DISPTREERANGE(firstBlockRange, storeSP);
3430
3431#endif // !defined(_TARGET_X86_) && !defined(_TARGET_ARM_)
3432
3433#if !defined(_TARGET_ARM_)
3434 // For arm32, CalleeSavedFP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
3435
3436 // --------------------------------------------------------
3437 // InlinedCallFrame.m_pCalleeSavedEBP = @RBP;
3438
3439 GenTreeLclFld* storeFP =
3440 new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
3441 callFrameInfo.offsetOfCalleeSavedFP);
3442 storeFP->gtOp1 = PhysReg(REG_FPBASE);
3443 storeFP->gtFlags |= GTF_VAR_DEF;
3444
3445 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeFP));
3446 DISPTREERANGE(firstBlockRange, storeFP);
3447#endif // !defined(_TARGET_ARM_)
3448
3449 // --------------------------------------------------------
3450 // On 32-bit targets, CORINFO_HELP_INIT_PINVOKE_FRAME initializes the PInvoke frame and then pushes it onto
3451 // the current thread's Frame stack. On 64-bit targets, it only initializes the PInvoke frame.
3452 CLANG_FORMAT_COMMENT_ANCHOR;
3453
3454#ifdef _TARGET_64BIT_
3455 if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
3456 {
3457 // Push a frame - if we are NOT in an IL stub, this is done right before the call
        // The init routine sets InlinedCallFrame's m_pNext, so we just set the thread's top-of-stack
3459 GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
3460 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
3461 ContainCheckStoreIndir(frameUpd->AsIndir());
3462 DISPTREERANGE(firstBlockRange, frameUpd);
3463 }
3464#endif // _TARGET_64BIT_
3465}
3466
3467//------------------------------------------------------------------------
3468// InsertPInvokeMethodEpilog: Code that needs to be run when exiting any method
3469// that has PInvoke inlines. This needs to be inserted any place you can exit the
3470// function: returns, tailcalls and jmps.
3471//
3472// Arguments:
3473// returnBB - basic block from which a method can return
//    lastExpr - GenTree of the last top-level statement of returnBB (debug-only arg)
3475//
3476// Return Value:
//    None.
3478//
3479void Lowering::InsertPInvokeMethodEpilog(BasicBlock* returnBB DEBUGARG(GenTree* lastExpr))
3480{
3481 assert(returnBB != nullptr);
3482 assert(comp->info.compCallUnmanaged);
3483
3484 if (comp->opts.ShouldUsePInvokeHelpers())
3485 {
3486 return;
3487 }
3488
3489 JITDUMP("======= Inserting PInvoke method epilog\n");
3490
    // A method doing PInvoke calls has exactly one return block unless it has "jmp" or tail calls.
3492 assert(((returnBB == comp->genReturnBB) && (returnBB->bbJumpKind == BBJ_RETURN)) ||
3493 returnBB->endsWithTailCallOrJmp(comp));
3494
3495 LIR::Range& returnBlockRange = LIR::AsRange(returnBB);
3496
3497 GenTree* insertionPoint = returnBlockRange.LastNode();
3498 assert(insertionPoint == lastExpr);
3499
3500 // Note: PInvoke Method Epilog (PME) needs to be inserted just before GT_RETURN, GT_JMP or GT_CALL node in execution
3501 // order so that it is guaranteed that there will be no further PInvokes after that point in the method.
3502 //
3503 // Example1: GT_RETURN(op1) - say execution order is: Op1, GT_RETURN. After inserting PME, execution order would be
3504 // Op1, PME, GT_RETURN
3505 //
3506 // Example2: GT_CALL(arg side effect computing nodes, Stk Args Setup, Reg Args setup). The execution order would be
3507 // arg side effect computing nodes, Stk Args setup, Reg Args setup, GT_CALL
3508 // After inserting PME execution order would be:
3509 // arg side effect computing nodes, Stk Args setup, Reg Args setup, PME, GT_CALL
3510 //
    // Example3: GT_JMP. After inserting the PME, execution order would be: PME, GT_JMP
    // That is, after the PME, the args for the GT_JMP call will be set up.
3513
    // TODO-Cleanup: setting GCState to 1 seems to be redundant as InsertPInvokeCallProlog will set it to zero before a
    // PInvoke call and InsertPInvokeCallEpilog() will set it back to 1 after the PInvoke. Though this is redundant,
    // it is harmless.
    // Note that liveness artificially extends the life of the compLvFrameListRoot var if the method being compiled has
    // PInvokes. Deleting the statement below would cause an assert in lsra.cpp::SetLastUses() since compLvFrameListRoot
    // would be live-in to a BBJ_RETURN block without any uses. Long term we need to fix liveness for the x64 case to
    // properly extend the life of the compLvFrameListRoot var.
3521 //
3522 // Thread.offsetOfGcState = 0/1
3523 // That is [tcb + offsetOfGcState] = 1
3524 GenTree* storeGCState = SetGCState(1);
3525 returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeGCState));
3526 ContainCheckStoreIndir(storeGCState->AsIndir());
3527
3528 // Pop the frame if necessary. This always happens in the epilog on 32-bit targets. For 64-bit targets, we only do
3529 // this in the epilog for IL stubs; for non-IL stubs the frame is popped after every PInvoke call.
3530 CLANG_FORMAT_COMMENT_ANCHOR;
3531
3532#ifdef _TARGET_64BIT_
3533 if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
3534#endif // _TARGET_64BIT_
3535 {
3536 GenTree* frameUpd = CreateFrameLinkUpdate(PopFrame);
3537 returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
3538 ContainCheckStoreIndir(frameUpd->AsIndir());
3539 }
3540}
3541
3542//------------------------------------------------------------------------
3543// InsertPInvokeCallProlog: Emit the call-site prolog for direct calls to unmanaged code.
3544// It does all the necessary call-site setup of the InlinedCallFrame.
3545//
3546// Arguments:
3547// call - the call for which we are inserting the PInvoke prolog.
3548//
3549// Return Value:
3550// None.
3551//
3552void Lowering::InsertPInvokeCallProlog(GenTreeCall* call)
3553{
3554 JITDUMP("======= Inserting PInvoke call prolog\n");
3555
3556 GenTree* insertBefore = call;
3557 if (call->gtCallType == CT_INDIRECT)
3558 {
3559 bool isClosed;
3560 insertBefore = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
3561 assert(isClosed);
3562 }
3563
3564 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
3565
3566 gtCallTypes callType = (gtCallTypes)call->gtCallType;
3567
3568 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
3569
3570 if (comp->opts.ShouldUsePInvokeHelpers())
3571 {
3572 // First argument is the address of the frame variable.
3573 GenTree* frameAddr = new (comp, GT_LCL_VAR_ADDR)
3574 GenTreeLclVar(GT_LCL_VAR_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET);
3575
3576 // Insert call to CORINFO_HELP_JIT_PINVOKE_BEGIN
3577 GenTree* helperCall =
3578 comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID, comp->gtNewArgList(frameAddr));
3579
3580 comp->fgMorphTree(helperCall);
3581 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, helperCall));
3582 LowerNode(helperCall); // helper call is inserted before current node and should be lowered here.
3583 return;
3584 }
3585
3586 // Emit the following sequence:
3587 //
3588 // InlinedCallFrame.callTarget = methodHandle // stored in m_Datum
3589 // InlinedCallFrame.m_pCallSiteSP = SP // x86 only
3590 // InlinedCallFrame.m_pCallerReturnAddress = return address
3591 // Thread.gcState = 0
3592 // (non-stub) - update top Frame on TCB // 64-bit targets only
3593
3594 // ----------------------------------------------------------------------------------
    // Set up InlinedCallFrame.callSiteTarget (which is how the JIT refers to it).
3596 // The actual field is InlinedCallFrame.m_Datum which has many different uses and meanings.
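    // Roughly, the value stored there is:
    //   - CT_INDIRECT on 32-bit targets: the outgoing stack arg size in bytes
    //   - CT_INDIRECT on 64-bit targets: the secret stub parameter, if the method publishes one
    //                                    (otherwise the VM initializes m_Datum itself)
    //   - CT_USER_FUNC:                  the method handle, loaded directly or via one indirection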
3597
3598 GenTree* src = nullptr;
3599
3600 if (callType == CT_INDIRECT)
3601 {
3602#if !defined(_TARGET_64BIT_)
3603 // On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum.
3604 const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE;
3605
3606 src = comp->gtNewIconNode(numStkArgBytes, TYP_INT);
3607#else
3608 // On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum.
3609 // If the stub parameter value is not needed, m_Datum will be initialized by the VM.
3610 if (comp->info.compPublishStubParam)
3611 {
3612 src = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL);
3613 }
3614#endif // !defined(_TARGET_64BIT_)
3615 }
3616 else
3617 {
3618 assert(callType == CT_USER_FUNC);
3619
3620 void* pEmbedMethodHandle = nullptr;
3621 CORINFO_METHOD_HANDLE embedMethodHandle =
3622 comp->info.compCompHnd->embedMethodHandle(call->gtCallMethHnd, &pEmbedMethodHandle);
3623
3624 noway_assert((!embedMethodHandle) != (!pEmbedMethodHandle));
3625
3626 if (embedMethodHandle != nullptr)
3627 {
3628 // InlinedCallFrame.callSiteTarget = methodHandle
3629 src = AddrGen(embedMethodHandle);
3630 }
3631 else
3632 {
3633 // InlinedCallFrame.callSiteTarget = *pEmbedMethodHandle
3634 src = Ind(AddrGen(pEmbedMethodHandle));
3635 }
3636 }
3637
3638 if (src != nullptr)
3639 {
3640 // Store into InlinedCallFrame.m_Datum, the offset of which is given by offsetOfCallTarget.
3641 GenTreeLclFld* store =
3642 new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
3643 callFrameInfo.offsetOfCallTarget);
3644 store->gtOp1 = src;
3645 store->gtFlags |= GTF_VAR_DEF;
3646
3647 InsertTreeBeforeAndContainCheck(insertBefore, store);
3648 }
3649
3650#ifdef _TARGET_X86_
3651
3652 // ----------------------------------------------------------------------------------
3653 // InlinedCallFrame.m_pCallSiteSP = SP
3654
3655 GenTreeLclFld* storeCallSiteSP = new (comp, GT_STORE_LCL_FLD)
3656 GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfCallSiteSP);
3657
3658 storeCallSiteSP->gtOp1 = PhysReg(REG_SPBASE);
3659 storeCallSiteSP->gtFlags |= GTF_VAR_DEF;
3660
3661 InsertTreeBeforeAndContainCheck(insertBefore, storeCallSiteSP);
3662
3663#endif
3664
3665 // ----------------------------------------------------------------------------------
3666 // InlinedCallFrame.m_pCallerReturnAddress = &label (the address of the instruction immediately following the call)
3667
3668 GenTreeLclFld* storeLab =
3669 new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
3670 callFrameInfo.offsetOfReturnAddress);
3671
3672 // We don't have a real label, and inserting one is hard (even if we made a special node),
3673 // so for now we will just 'know' what this means in codegen.
3674 GenTreeLabel* labelRef = new (comp, GT_LABEL) GenTreeLabel(nullptr);
3675 labelRef->gtType = TYP_I_IMPL;
3676 storeLab->gtOp1 = labelRef;
3677 storeLab->gtFlags |= GTF_VAR_DEF;
3678
3679 InsertTreeBeforeAndContainCheck(insertBefore, storeLab);
3680
    // Push the PInvoke frame if necessary. On 32-bit targets this only happens in the method prolog if a method
    // contains PInvokes; on 64-bit targets it is done here, at each call site, for non-IL-stubs (IL stubs push the
    // frame once in the method prolog).
3683 CLANG_FORMAT_COMMENT_ANCHOR;
3684
3685#ifdef _TARGET_64BIT_
3686 if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
3687 {
3688 // Set the TCB's frame to be the one we just created.
3689 // Note the init routine for the InlinedCallFrame (CORINFO_HELP_INIT_PINVOKE_FRAME)
3690 // has prepended it to the linked list to maintain the stack of Frames.
3691 //
3692 // Stubs do this once per stub, not once per call.
3693 GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
3694 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, frameUpd));
3695 ContainCheckStoreIndir(frameUpd->AsIndir());
3696 }
3697#endif // _TARGET_64BIT_
3698
3699 // IMPORTANT **** This instruction must come last!!! ****
3700 // It changes the thread's state to Preemptive mode
3701 // ----------------------------------------------------------------------------------
3702 // [tcb + offsetOfGcState] = 0
3703
3704 GenTree* storeGCState = SetGCState(0);
3705 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, storeGCState));
3706 ContainCheckStoreIndir(storeGCState->AsIndir());
3707}
3708
3709//------------------------------------------------------------------------
// InsertPInvokeCallEpilog: Insert the code that goes after every inlined PInvoke call.
3711//
3712// Arguments:
3713// call - the call for which we are inserting the PInvoke epilog.
3714//
3715// Return Value:
3716// None.
3717//
3718void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call)
3719{
3720 JITDUMP("======= Inserting PInvoke call epilog\n");
3721
3722 if (comp->opts.ShouldUsePInvokeHelpers())
3723 {
3724 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
3725
3726 // First argument is the address of the frame variable.
3727 GenTree* frameAddr =
3728 new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET);
3729 frameAddr->SetOperRaw(GT_LCL_VAR_ADDR);
3730
3731 // Insert call to CORINFO_HELP_JIT_PINVOKE_END
3732 GenTreeCall* helperCall =
3733 comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_END, TYP_VOID, comp->gtNewArgList(frameAddr));
3734
3735 comp->fgMorphTree(helperCall);
3736 BlockRange().InsertAfter(call, LIR::SeqTree(comp, helperCall));
3737 ContainCheckCallOperands(helperCall);
3738 return;
3739 }
3740
3741 // gcstate = 1
3742 GenTree* insertionPoint = call->gtNext;
3743
3744 GenTree* tree = SetGCState(1);
3745 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
3746 ContainCheckStoreIndir(tree->AsIndir());
3747
3748 tree = CreateReturnTrapSeq();
3749 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
3750 ContainCheckReturnTrap(tree->AsOp());
3751
    // Pop the frame if necessary. On 32-bit targets this only happens in the method epilog; on 64-bit targets this
    // happens after every PInvoke call in non-stubs. 32-bit targets instead mark the frame as inactive.
3754 CLANG_FORMAT_COMMENT_ANCHOR;
3755
3756#ifdef _TARGET_64BIT_
3757 if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
3758 {
3759 tree = CreateFrameLinkUpdate(PopFrame);
3760 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
3761 ContainCheckStoreIndir(tree->AsIndir());
3762 }
3763#else
3764 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
3765
3766 // ----------------------------------------------------------------------------------
3767 // InlinedCallFrame.m_pCallerReturnAddress = nullptr
3768
3769 GenTreeLclFld* const storeCallSiteTracker =
3770 new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
3771 callFrameInfo.offsetOfReturnAddress);
3772
3773 GenTreeIntCon* const constantZero = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_I_IMPL, 0);
3774
3775 storeCallSiteTracker->gtOp1 = constantZero;
3776 storeCallSiteTracker->gtFlags |= GTF_VAR_DEF;
3777
3778 BlockRange().InsertBefore(insertionPoint, constantZero, storeCallSiteTracker);
3779 ContainCheckStoreLoc(storeCallSiteTracker);
3780#endif // _TARGET_64BIT_
3781}
3782
3783//------------------------------------------------------------------------
3784// LowerNonvirtPinvokeCall: Lower a non-virtual / indirect PInvoke call
3785//
3786// Arguments:
3787// call - The call to lower.
3788//
3789// Return Value:
3790// The lowered call tree.
3791//
3792GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call)
3793{
3794 // PInvoke lowering varies depending on the flags passed in by the EE. By default,
3795 // GC transitions are generated inline; if CORJIT_FLAG_USE_PINVOKE_HELPERS is specified,
3796 // GC transitions are instead performed using helper calls. Examples of each case are given
3797 // below. Note that the data structure that is used to store information about a call frame
3798 // containing any P/Invoke calls is initialized in the method prolog (see
3799 // InsertPInvokeMethod{Prolog,Epilog} for details).
3800 //
3801 // Inline transitions:
3802 // InlinedCallFrame inlinedCallFrame;
3803 //
3804 // ...
3805 //
3806 // // Set up frame information
3807 // inlinedCallFrame.callTarget = methodHandle; // stored in m_Datum
3808 // inlinedCallFrame.m_pCallSiteSP = SP; // x86 only
3809 // inlinedCallFrame.m_pCallerReturnAddress = &label; (the address of the instruction immediately following the
3810 // call)
3811 // Thread.m_pFrame = &inlinedCallFrame; (non-IL-stub only)
3812 //
3813 // // Switch the thread's GC mode to preemptive mode
3814 // thread->m_fPreemptiveGCDisabled = 0;
3815 //
3816 // // Call the unmanaged method
3817 // target();
3818 //
3819 // // Switch the thread's GC mode back to cooperative mode
3820 // thread->m_fPreemptiveGCDisabled = 1;
3821 //
3822 // // Rendezvous with a running collection if necessary
3823 // if (g_TrapReturningThreads)
3824 // RareDisablePreemptiveGC();
3825 //
    // Transitions using helpers:
3827 //
3828 // OpaqueFrame opaqueFrame;
3829 //
3830 // ...
3831 //
3832 // // Call the JIT_PINVOKE_BEGIN helper
3833 // JIT_PINVOKE_BEGIN(&opaqueFrame);
3834 //
3835 // // Call the unmanaged method
3836 // target();
3837 //
3838 // // Call the JIT_PINVOKE_END helper
3839 // JIT_PINVOKE_END(&opaqueFrame);
3840 //
    // Note that the JIT_PINVOKE_{BEGIN,END} helpers currently use the default calling convention for the target
    // platform. They may be changed in the future such that they preserve all register values.
3843
3844 GenTree* result = nullptr;
3845 void* addr = nullptr;
3846
3847 // assert we have seen one of these
3848 noway_assert(comp->info.compCallUnmanaged != 0);
3849
3850 // All code generated by this function must not contain the randomly-inserted NOPs
3851 // that we insert to inhibit JIT spraying in partial trust scenarios.
3852 // The PINVOKE_PROLOG op signals this to the code generator/emitter.
3853
3854 GenTree* prolog = new (comp, GT_NOP) GenTree(GT_PINVOKE_PROLOG, TYP_VOID);
3855 BlockRange().InsertBefore(call, prolog);
3856
3857 InsertPInvokeCallProlog(call);
3858
3859 if (call->gtCallType != CT_INDIRECT)
3860 {
3861 noway_assert(call->gtCallType == CT_USER_FUNC);
3862 CORINFO_METHOD_HANDLE methHnd = call->gtCallMethHnd;
3863
3864 CORINFO_CONST_LOOKUP lookup;
3865 comp->info.compCompHnd->getAddressOfPInvokeTarget(methHnd, &lookup);
3866
3867 void* addr = lookup.addr;
3868 switch (lookup.accessType)
3869 {
3870 case IAT_VALUE:
3871 if (!IsCallTargetInRange(addr))
3872 {
3873 result = AddrGen(addr);
3874 }
3875 else
3876 {
3877 // a direct call within range of hardware relative call instruction
3878 // stash the address for codegen
3879 call->gtDirectCallAddress = addr;
3880#ifdef FEATURE_READYTORUN_COMPILER
3881 call->gtEntryPoint.addr = nullptr;
3882 call->gtEntryPoint.accessType = IAT_VALUE;
3883#endif
3884 }
3885 break;
3886
3887 case IAT_PVALUE:
3888 result = Ind(AddrGen(addr));
3889 break;
3890
3891 case IAT_PPVALUE:
3892 result = Ind(Ind(AddrGen(addr)));
3893 break;
3894
3895 case IAT_RELPVALUE:
3896 unreached();
3897 }
3898 }
3899
3900 InsertPInvokeCallEpilog(call);
3901
3902 return result;
3903}
3904
3905// Expand the code necessary to calculate the control target.
3906// Returns: the expression needed to calculate the control target
3907// May insert embedded statements
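// As a rough sketch of the non-relative case, the expression built below is:
//     vtab   = [thisPtr + VPTR_OFFS]              // pointer to the method table
//     chunk  = [vtab + vtabOffsOfIndirection]     // skipped when there is no chunk indirection
//     target = [chunk + vtabOffsAfterIndirection]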
3908GenTree* Lowering::LowerVirtualVtableCall(GenTreeCall* call)
3909{
3910 noway_assert(call->gtCallType == CT_USER_FUNC);
3911
3912 // If this is a tail call via helper, thisPtr will be the third argument.
3913 int thisPtrArgNum;
3914 regNumber thisPtrArgReg;
3915
3916#ifndef _TARGET_X86_ // x86 tailcall via helper follows normal calling convention, but with extra stack args.
3917 if (call->IsTailCallViaHelper())
3918 {
3919 thisPtrArgNum = 2;
3920 thisPtrArgReg = REG_ARG_2;
3921 }
3922 else
3923#endif // !_TARGET_X86_
3924 {
3925 thisPtrArgNum = 0;
3926 thisPtrArgReg = comp->codeGen->genGetThisArgReg(call);
3927 }
3928
3929 // get a reference to the thisPtr being passed
3930 fgArgTabEntry* argEntry = comp->gtArgEntryByArgNum(call, thisPtrArgNum);
3931 assert(argEntry->regNum == thisPtrArgReg);
3932 assert(argEntry->node->gtOper == GT_PUTARG_REG);
3933 GenTree* thisPtr = argEntry->node->gtOp.gtOp1;
3934
3935 // If what we are passing as the thisptr is not already a local, make a new local to place it in
3936 // because we will be creating expressions based on it.
3937 unsigned lclNum;
3938 if (thisPtr->IsLocal())
3939 {
3940 lclNum = thisPtr->gtLclVarCommon.gtLclNum;
3941 }
3942 else
3943 {
3944 // Split off the thisPtr and store to a temporary variable.
3945 if (vtableCallTemp == BAD_VAR_NUM)
3946 {
3947 vtableCallTemp = comp->lvaGrabTemp(true DEBUGARG("virtual vtable call"));
3948 }
3949
3950 LIR::Use thisPtrUse(BlockRange(), &(argEntry->node->gtOp.gtOp1), argEntry->node);
3951 ReplaceWithLclVar(thisPtrUse, vtableCallTemp);
3952
3953 lclNum = vtableCallTemp;
3954 }
3955
3956 // Get hold of the vtable offset (note: this might be expensive)
3957 unsigned vtabOffsOfIndirection;
3958 unsigned vtabOffsAfterIndirection;
3959 bool isRelative;
3960 comp->info.compCompHnd->getMethodVTableOffset(call->gtCallMethHnd, &vtabOffsOfIndirection,
3961 &vtabOffsAfterIndirection, &isRelative);
3962
3963 // If the thisPtr is a local field, then construct a local field type node
3964 GenTree* local;
3965 if (thisPtr->isLclField())
3966 {
3967 local = new (comp, GT_LCL_FLD)
3968 GenTreeLclFld(GT_LCL_FLD, thisPtr->TypeGet(), lclNum, thisPtr->AsLclFld()->gtLclOffs);
3969 }
3970 else
3971 {
3972 local = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, thisPtr->TypeGet(), lclNum, BAD_IL_OFFSET);
3973 }
3974
3975 // pointer to virtual table = [REG_CALL_THIS + offs]
3976 GenTree* result = Ind(Offset(local, VPTR_OFFS));
3977
3978 // Get the appropriate vtable chunk
3979 if (vtabOffsOfIndirection != CORINFO_VIRTUALCALL_NO_CHUNK)
3980 {
3981 if (isRelative)
3982 {
3983 // MethodTable offset is a relative pointer.
3984 //
3985 // Additional temporary variable is used to store virtual table pointer.
            // The address of the method is obtained by the following computations:
3987 //
3988 // Save relative offset to tmp (vtab is virtual table pointer, vtabOffsOfIndirection is offset of
3989 // vtable-1st-level-indirection):
3990 // tmp = vtab
3991 //
3992 // Save address of method to result (vtabOffsAfterIndirection is offset of vtable-2nd-level-indirection):
3993 // result = [tmp + vtabOffsOfIndirection + vtabOffsAfterIndirection + [tmp + vtabOffsOfIndirection]]
3994 //
3995 //
3996 // If relative pointers are also in second level indirection, additional temporary is used:
3997 // tmp1 = vtab
3998 // tmp2 = tmp1 + vtabOffsOfIndirection + vtabOffsAfterIndirection + [tmp1 + vtabOffsOfIndirection]
3999 // result = tmp2 + [tmp2]
4000 //
4001 unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp"));
4002 unsigned lclNumTmp2 = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp2"));
4003
4004 GenTree* lclvNodeStore = comp->gtNewTempAssign(lclNumTmp, result);
4005
4006 GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, result->TypeGet());
4007 tmpTree = Offset(tmpTree, vtabOffsOfIndirection);
4008
4009 tmpTree = comp->gtNewOperNode(GT_IND, TYP_I_IMPL, tmpTree, false);
4010 GenTree* offs = comp->gtNewIconNode(vtabOffsOfIndirection + vtabOffsAfterIndirection, TYP_INT);
4011 result = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, comp->gtNewLclvNode(lclNumTmp, result->TypeGet()), offs);
4012
4013 GenTree* base = OffsetByIndexWithScale(result, tmpTree, 1);
4014 GenTree* lclvNodeStore2 = comp->gtNewTempAssign(lclNumTmp2, base);
4015
4016 LIR::Range range = LIR::SeqTree(comp, lclvNodeStore);
4017 JITDUMP("result of obtaining pointer to virtual table:\n");
4018 DISPRANGE(range);
4019 BlockRange().InsertBefore(call, std::move(range));
4020
4021 LIR::Range range2 = LIR::SeqTree(comp, lclvNodeStore2);
4022 JITDUMP("result of obtaining pointer to virtual table 2nd level indirection:\n");
4023 DISPRANGE(range2);
4024 BlockRange().InsertAfter(lclvNodeStore, std::move(range2));
4025
4026 result = Ind(comp->gtNewLclvNode(lclNumTmp2, result->TypeGet()));
4027 result =
4028 comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, result, comp->gtNewLclvNode(lclNumTmp2, result->TypeGet()));
4029 }
4030 else
4031 {
4032 // result = [REG_CALL_IND_SCRATCH + vtabOffsOfIndirection]
4033 result = Ind(Offset(result, vtabOffsOfIndirection));
4034 }
4035 }
4036 else
4037 {
4038 assert(!isRelative);
4039 }
4040
4041 // Load the function address
4042 // result = [reg+vtabOffs]
4043 if (!isRelative)
4044 {
4045 result = Ind(Offset(result, vtabOffsAfterIndirection));
4046 }
4047
4048 return result;
4049}
4050
4051// Lower stub dispatched virtual calls.
4052GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call)
4053{
4054 assert(call->IsVirtualStub());
4055
4056 // An x86 JIT which uses full stub dispatch must generate only
4057 // the following stub dispatch calls:
4058 //
4059 // (1) isCallRelativeIndirect:
4060 // call dword ptr [rel32] ; FF 15 ---rel32----
4061 // (2) isCallRelative:
4062 // call abc ; E8 ---rel32----
4063 // (3) isCallRegisterIndirect:
4064 // 3-byte nop ;
4065 // call dword ptr [eax] ; FF 10
4066 //
4067 // THIS IS VERY TIGHTLY TIED TO THE PREDICATES IN
4068 // vm\i386\cGenCpu.h, esp. isCallRegisterIndirect.
4069
4070 GenTree* result = nullptr;
4071
4072#ifdef _TARGET_64BIT_
4073 // Non-tail calls: Jump Stubs are not taken into account by VM for mapping an AV into a NullRef
4074 // exception. Therefore, JIT needs to emit an explicit null check. Note that Jit64 too generates
4075 // an explicit null check.
4076 //
4077 // Tail calls: fgMorphTailCall() materializes null check explicitly and hence no need to emit
4078 // null check.
4079
4080 // Non-64-bit: No need to null check the this pointer - the dispatch code will deal with this.
4081 // The VM considers exceptions that occur in stubs on 64-bit to be not managed exceptions and
4082 // it would be difficult to change this in a way so that it affects only the right stubs.
4083
4084 if (!call->IsTailCallViaHelper())
4085 {
4086 call->gtFlags |= GTF_CALL_NULLCHECK;
4087 }
4088#endif
4089
4090 // This is code to set up an indirect call to a stub address computed
4091 // via dictionary lookup.
4092 if (call->gtCallType == CT_INDIRECT)
4093 {
4094 // The importer decided we needed a stub call via a computed
4095 // stub dispatch address, i.e. an address which came from a dictionary lookup.
4096 // - The dictionary lookup produces an indirected address, suitable for call
4097 // via "call [VirtualStubParam.reg]"
4098 //
4099 // This combination will only be generated for shared generic code and when
4100 // stub dispatch is active.
4101
4102 // fgMorphArgs will have created trees to pass the address in VirtualStubParam.reg.
4103 // All we have to do here is add an indirection to generate the actual call target.
4104
4105 GenTree* ind = Ind(call->gtCallAddr);
4106 BlockRange().InsertAfter(call->gtCallAddr, ind);
4107 call->gtCallAddr = ind;
4108
4109 ind->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
4110
4111 ContainCheckIndir(ind->AsIndir());
4112 }
4113 else
4114 {
4115 // Direct stub call.
4116 // Get stub addr. This will return NULL if virtual call stubs are not active
4117 void* stubAddr = call->gtStubCallStubAddr;
4118 noway_assert(stubAddr != nullptr);
4119
        // If this is not a CT_INDIRECT call, then it should always be a relative indirect call.
        // This is ensured by the VM.
4122 noway_assert(call->IsVirtualStubRelativeIndir());
4123
        // This is a direct stub call, though the stubAddr itself may still need to be
        // accessed via an indirection.
4126 GenTree* addr = AddrGen(stubAddr);
4127
4128#ifdef _TARGET_X86_
4129 // On x86, for tailcall via helper, the JIT_TailCall helper takes the stubAddr as
4130 // the target address, and we set a flag that it's a VSD call. The helper then
4131 // handles any necessary indirection.
4132 if (call->IsTailCallViaHelper())
4133 {
4134 result = addr;
4135 }
4136#endif // _TARGET_X86_
4137
4138 if (result == nullptr)
4139 {
4140 result = Ind(addr);
4141 }
4142 }
4143
4144 // TODO-Cleanup: start emitting random NOPS
4145 return result;
4146}
4147
4148//------------------------------------------------------------------------
4149// AddrModeCleanupHelper: Remove the nodes that are no longer used after an
4150// addressing mode is constructed
4151//
4152// Arguments:
4153// addrMode - A pointer to a new GenTreeAddrMode
4154// node - The node currently being considered for removal
4155//
4156// Return Value:
4157// None.
4158//
4159// Assumptions:
4160// 'addrMode' and 'node' must be contained in the current block
4161//
4162void Lowering::AddrModeCleanupHelper(GenTreeAddrMode* addrMode, GenTree* node)
4163{
4164 if (node == addrMode->Base() || node == addrMode->Index())
4165 {
4166 return;
4167 }
4168
4169 // TODO-LIR: change this to use the LIR mark bit and iterate instead of recursing
4170 node->VisitOperands([this, addrMode](GenTree* operand) -> GenTree::VisitResult {
4171 AddrModeCleanupHelper(addrMode, operand);
4172 return GenTree::VisitResult::Continue;
4173 });
4174
4175 BlockRange().Remove(node);
4176}
4177
4178//------------------------------------------------------------------------
// Lowering::AreSourcesPossiblyModifiedLocals:
4180// Given two nodes which will be used in an addressing mode (base,
4181// index), check to see if they are lclVar reads, and if so, walk
4182// backwards from the use until both reads have been visited to
4183// determine if they are potentially modified in that range.
4184//
4185// Arguments:
4186// addr - the node that uses the base and index nodes
4187// base - the base node
4188// index - the index node
4189//
4190// Returns: true if either the base or index may be modified between the
4191// node and addr.
4192//
4193bool Lowering::AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, GenTree* index)
4194{
4195 assert(addr != nullptr);
4196
4197 unsigned markCount = 0;
4198
4199 SideEffectSet baseSideEffects;
4200 if (base != nullptr)
4201 {
4202 if (base->OperIsLocalRead())
4203 {
4204 baseSideEffects.AddNode(comp, base);
4205 }
4206 else
4207 {
4208 base = nullptr;
4209 }
4210 }
4211
4212 SideEffectSet indexSideEffects;
4213 if (index != nullptr)
4214 {
4215 if (index->OperIsLocalRead())
4216 {
4217 indexSideEffects.AddNode(comp, index);
4218 }
4219 else
4220 {
4221 index = nullptr;
4222 }
4223 }
4224
4225 for (GenTree* cursor = addr;; cursor = cursor->gtPrev)
4226 {
4227 assert(cursor != nullptr);
4228
4229 if (cursor == base)
4230 {
4231 base = nullptr;
4232 }
4233
4234 if (cursor == index)
4235 {
4236 index = nullptr;
4237 }
4238
4239 if ((base == nullptr) && (index == nullptr))
4240 {
4241 return false;
4242 }
4243
4244 m_scratchSideEffects.Clear();
4245 m_scratchSideEffects.AddNode(comp, cursor);
4246 if ((base != nullptr) && m_scratchSideEffects.InterferesWith(baseSideEffects, false))
4247 {
4248 return true;
4249 }
4250
4251 if ((index != nullptr) && m_scratchSideEffects.InterferesWith(indexSideEffects, false))
4252 {
4253 return true;
4254 }
4255 }
4256}
4257
4258//------------------------------------------------------------------------
4259// TryCreateAddrMode: recognize trees which can be implemented using an
4260// addressing mode and transform them to a GT_LEA
4261//
4262// Arguments:
4263// use: the use of the address we want to transform
4264// isIndir: true if this addressing mode is the child of an indir
4265//
4266// Returns:
4267// The created LEA node or the original address node if an LEA could
4268// not be formed.
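//
// Notes:
//    As a rough example (assuming x86/x64-style scaled addressing), an address tree such as
//        ADD(lclVar base, ADD(LSH(lclVar index, 2), CNS 16))
//    under an indirection can be folded into a single
//        LEA(base + index*4 + 16)
//    node, after which the subsumed ADD/LSH/CNS nodes are removed from the LIR range.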
4269//
4270GenTree* Lowering::TryCreateAddrMode(LIR::Use&& use, bool isIndir)
4271{
4272 GenTree* addr = use.Def();
4273 GenTree* base = nullptr;
4274 GenTree* index = nullptr;
4275 unsigned scale = 0;
4276 ssize_t offset = 0;
4277 bool rev = false;
4278
4279 // TODO-1stClassStructs: This logic is here to preserve prior behavior. Note that previously
4280 // block ops were not considered for addressing modes, but an add under it may have been.
4281 // This should be replaced with logic that more carefully determines when an addressing mode
4282 // would be beneficial for a block op.
4283 if (isIndir)
4284 {
4285 GenTree* indir = use.User();
4286 if (indir->TypeGet() == TYP_STRUCT)
4287 {
4288 isIndir = false;
4289 }
4290 else if (varTypeIsStruct(indir))
4291 {
4292 // We can have an indirection on the rhs of a block copy (it is the source
4293 // object). This is not a "regular" indirection.
4294 // (Note that the user check could be costly.)
4295 LIR::Use indirUse;
4296 if (BlockRange().TryGetUse(indir, &indirUse) && indirUse.User()->OperIsIndir())
4297 {
4298 isIndir = false;
4299 }
4300 else
4301 {
4302 isIndir = !indir->OperIsBlk();
4303 }
4304 }
4305 }
4306
4307 // Find out if an addressing mode can be constructed
4308 bool doAddrMode = comp->codeGen->genCreateAddrMode(addr, // address
4309 true, // fold
4310 &rev, // reverse ops
4311 &base, // base addr
4312 &index, // index val
4313#if SCALED_ADDR_MODES
4314 &scale, // scaling
4315#endif // SCALED_ADDR_MODES
4316 &offset); // displacement
4317
4318 if (scale == 0)
4319 {
4320 scale = 1;
4321 }
4322
4323 if (!isIndir)
4324 {
4325 // this is just a reg-const add
4326 if (index == nullptr)
4327 {
4328 return addr;
4329 }
4330
4331 // this is just a reg-reg add
4332 if (scale == 1 && offset == 0)
4333 {
4334 return addr;
4335 }
4336 }
4337
4338 // make sure there are not any side effects between def of leaves and use
4339 if (!doAddrMode || AreSourcesPossiblyModifiedLocals(addr, base, index))
4340 {
4341 JITDUMP("No addressing mode:\n ");
4342 DISPNODE(addr);
4343 return addr;
4344 }
4345
4346 GenTree* arrLength = nullptr;
4347
4348 JITDUMP("Addressing mode:\n");
4349 JITDUMP(" Base\n ");
4350 DISPNODE(base);
4351 if (index != nullptr)
4352 {
4353 JITDUMP(" + Index * %u + %d\n ", scale, offset);
4354 DISPNODE(index);
4355 }
4356 else
4357 {
4358 JITDUMP(" + %d\n", offset);
4359 }
4360
4361 var_types addrModeType = addr->TypeGet();
4362 if (addrModeType == TYP_REF)
4363 {
4364 addrModeType = TYP_BYREF;
4365 }
4366
4367 GenTreeAddrMode* addrMode = new (comp, GT_LEA) GenTreeAddrMode(addrModeType, base, index, scale, offset);
4368
4369 // Neither the base nor the index should now be contained.
4370 if (base != nullptr)
4371 {
4372 base->ClearContained();
4373 }
4374 if (index != nullptr)
4375 {
4376 index->ClearContained();
4377 }
4378 addrMode->gtFlags |= (addr->gtFlags & GTF_IND_FLAGS);
4379 addrMode->gtFlags &= ~GTF_ALL_EFFECT; // LEAs are side-effect-free.
4380
4381 JITDUMP("New addressing mode node:\n");
4382 DISPNODE(addrMode);
4383 JITDUMP("\n");
4384
4385 BlockRange().InsertAfter(addr, addrMode);
4386
4387 // Now we need to remove all the nodes subsumed by the addrMode
4388 AddrModeCleanupHelper(addrMode, addr);
4389
4390 // Replace the original address node with the addrMode.
4391 use.ReplaceWith(comp, addrMode);
4392
4393 return addrMode;
4394}
4395
4396//------------------------------------------------------------------------
4397// LowerAdd: turn this add into a GT_LEA if that would be profitable
4398//
4399// Arguments:
4400// node - the node we care about
4401//
4402// Returns:
4403// The next node to lower if we have transformed the ADD; nullptr otherwise.
4404//
4405GenTree* Lowering::LowerAdd(GenTree* node)
4406{
4407 GenTree* next = node->gtNext;
4408
4409#ifndef _TARGET_ARMARCH_
4410 if (varTypeIsIntegralOrI(node))
4411 {
4412 LIR::Use use;
4413 if (BlockRange().TryGetUse(node, &use))
4414 {
4415 // If this is a child of an indir, let the parent handle it.
4416 // If there is a chain of adds, only look at the topmost one.
4417 GenTree* parent = use.User();
4418 if (!parent->OperIsIndir() && (parent->gtOper != GT_ADD))
4419 {
4420 GenTree* addr = TryCreateAddrMode(std::move(use), false);
4421 if (addr != node)
4422 {
4423 return addr->gtNext;
4424 }
4425 }
4426 }
4427 }
4428#endif // !_TARGET_ARMARCH_
4429
4430 return nullptr;
4431}
4432
4433//------------------------------------------------------------------------
4434// LowerUnsignedDivOrMod: Lowers a GT_UDIV/GT_UMOD node.
4435//
4436// Arguments:
4437// divMod - pointer to the GT_UDIV/GT_UMOD node to be lowered
4438//
4439// Return Value:
4440// Returns a boolean indicating whether the node was transformed.
4441//
4442// Notes:
4443// - Transform UDIV/UMOD by power of 2 into RSZ/AND
4444// - Transform UDIV by constant >= 2^(N-1) into GE
4445// - Transform UDIV/UMOD by constant >= 3 into "magic division"
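//
//     As a rough sketch for 32-bit operands (the magic constant shown is illustrative of what
//     MagicDivide computes for a divisor of 3):
//         x UDIV 8  =>  x RSZ 3
//         x UMOD 8  =>  x AND 7
//         x UDIV 3  =>  (x MULHI 0xAAAAAAAB) RSZ 1   // "magic division", no adjustment needed
//         x UDIV 7  =>  as above, plus the SUB/RSZ/ADD adjustment sequence (the "add" case)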
4446//
4447
4448bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
4449{
4450 assert(divMod->OperIs(GT_UDIV, GT_UMOD));
4451
4452#if defined(USE_HELPERS_FOR_INT_DIV)
4453 if (!varTypeIsIntegral(divMod->TypeGet()))
4454 {
4455 assert(!"unreachable: integral GT_UDIV/GT_UMOD should get morphed into helper calls");
4456 }
4457 assert(varTypeIsFloating(divMod->TypeGet()));
4458#endif // USE_HELPERS_FOR_INT_DIV
4459#if defined(_TARGET_ARM64_)
4460 assert(divMod->OperGet() != GT_UMOD);
4461#endif // _TARGET_ARM64_
4462
4463 GenTree* next = divMod->gtNext;
4464 GenTree* dividend = divMod->gtGetOp1();
4465 GenTree* divisor = divMod->gtGetOp2();
4466
4467#if !defined(_TARGET_64BIT_)
4468 if (dividend->OperIs(GT_LONG))
4469 {
4470 return false;
4471 }
4472#endif
4473
4474 if (!divisor->IsCnsIntOrI())
4475 {
4476 return false;
4477 }
4478
4479 if (dividend->IsCnsIntOrI())
4480 {
4481 // We shouldn't see a divmod with constant operands here but if we do then it's likely
4482 // because optimizations are disabled or it's a case that's supposed to throw an exception.
4483 // Don't optimize this.
4484 return false;
4485 }
4486
4487 const var_types type = divMod->TypeGet();
4488 assert((type == TYP_INT) || (type == TYP_I_IMPL));
4489
4490 size_t divisorValue = static_cast<size_t>(divisor->AsIntCon()->IconValue());
4491
4492 if (type == TYP_INT)
4493 {
        // Clear the upper 32 bits of the value; they may be set to 1 because constants
        // are treated as signed and stored in ssize_t, which is 64 bits wide on 64-bit targets.
4496 divisorValue &= UINT32_MAX;
4497 }
4498
4499 if (divisorValue == 0)
4500 {
4501 return false;
4502 }
4503
4504 const bool isDiv = divMod->OperIs(GT_UDIV);
4505
4506 if (isPow2(divisorValue))
4507 {
4508 genTreeOps newOper;
4509
4510 if (isDiv)
4511 {
4512 newOper = GT_RSZ;
4513 divisorValue = genLog2(divisorValue);
4514 }
4515 else
4516 {
4517 newOper = GT_AND;
4518 divisorValue -= 1;
4519 }
4520
4521 divMod->SetOper(newOper);
4522 divisor->gtIntCon.SetIconValue(divisorValue);
4523 ContainCheckNode(divMod);
4524 return true;
4525 }
4526 if (isDiv)
4527 {
        // If the divisor is greater than or equal to 2^(N - 1) then the result is 1
        // iff the dividend is greater than or equal to the divisor.
4530 if (((type == TYP_INT) && (divisorValue > (UINT32_MAX / 2))) ||
4531 ((type == TYP_LONG) && (divisorValue > (UINT64_MAX / 2))))
4532 {
4533 divMod->SetOper(GT_GE);
4534 divMod->gtFlags |= GTF_UNSIGNED;
4535 ContainCheckNode(divMod);
4536 return true;
4537 }
4538 }
4539
4540// TODO-ARM-CQ: Currently there's no GT_MULHI for ARM32
4541#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
4542 if (!comp->opts.MinOpts() && (divisorValue >= 3))
4543 {
4544 size_t magic;
4545 bool add;
4546 int shift;
4547
4548 if (type == TYP_INT)
4549 {
4550 magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
4551 }
4552 else
4553 {
4554#ifdef _TARGET_64BIT_
4555 magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
4556#else
4557 unreached();
4558#endif
4559 }
4560
        // Depending on the "add" flag returned by GetUnsigned32Magic/GetUnsigned64Magic we need to generate:
4562 // add == false (when divisor == 3 for example):
4563 // div = (dividend MULHI magic) RSZ shift
4564 // add == true (when divisor == 7 for example):
4565 // mulhi = dividend MULHI magic
4566 // div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
4567 const bool requiresAdjustment = add;
4568 const bool requiresDividendMultiuse = requiresAdjustment || !isDiv;
4569 const unsigned curBBWeight = m_block->getBBWeight(comp);
4570
4571 if (requiresDividendMultiuse)
4572 {
4573 LIR::Use dividendUse(BlockRange(), &divMod->gtOp1, divMod);
4574 dividend = ReplaceWithLclVar(dividendUse);
4575 }
4576
4577 // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
4578 // The existing node will later be transformed into a GT_RSZ/GT_SUB that
        // computes the final result. This way we don't need to find and change the use
        // of the existing node.
4581 GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
4582 mulhi->gtFlags |= GTF_UNSIGNED;
4583 divisor->AsIntCon()->SetIconValue(magic);
4584 BlockRange().InsertBefore(divMod, mulhi);
4585 GenTree* firstNode = mulhi;
4586
4587 if (requiresAdjustment)
4588 {
4589 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
4590 GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
4591 BlockRange().InsertBefore(divMod, dividend, sub);
4592
4593 GenTree* one = comp->gtNewIconNode(1, TYP_INT);
4594 GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
4595 BlockRange().InsertBefore(divMod, one, rsz);
4596
4597 LIR::Use mulhiUse(BlockRange(), &sub->gtOp.gtOp2, sub);
4598 mulhi = ReplaceWithLclVar(mulhiUse);
4599
4600 mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
4601 GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
4602 BlockRange().InsertBefore(divMod, mulhi, add);
4603
4604 mulhi = add;
4605 shift -= 1;
4606 }
4607
4608 GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
4609 BlockRange().InsertBefore(divMod, shiftBy);
4610
4611 if (isDiv)
4612 {
4613 divMod->SetOper(GT_RSZ);
4614 divMod->gtOp1 = mulhi;
4615 divMod->gtOp2 = shiftBy;
4616 }
4617 else
4618 {
4619 GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
4620
            // dividend UMOD divisor = dividend SUB (div MUL divisor)
4622 GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
4623 GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
4624 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
4625
4626 divMod->SetOper(GT_SUB);
4627 divMod->gtOp1 = dividend;
4628 divMod->gtOp2 = mul;
4629
4630 BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
4631 }
4632 ContainCheckRange(firstNode, divMod);
4633
4634 return true;
4635 }
4636#endif
4637 return false;
4638}
4639
// LowerConstIntDivOrMod: Transform integer GT_DIV/GT_MOD nodes with a constant
// divisor into equivalent but faster sequences.
4642//
4643// Arguments:
4644// node - pointer to the DIV or MOD node
4645//
4646// Returns:
4647// nullptr if no transformation is done, or the next node in the transformed node sequence that
4648// needs to be lowered.
4649//
4650GenTree* Lowering::LowerConstIntDivOrMod(GenTree* node)
4651{
4652 assert((node->OperGet() == GT_DIV) || (node->OperGet() == GT_MOD));
4653 GenTree* divMod = node;
4654 GenTree* dividend = divMod->gtGetOp1();
4655 GenTree* divisor = divMod->gtGetOp2();
4656
4657 const var_types type = divMod->TypeGet();
4658 assert((type == TYP_INT) || (type == TYP_LONG));
4659
4660#if defined(USE_HELPERS_FOR_INT_DIV)
4661 assert(!"unreachable: integral GT_DIV/GT_MOD should get morphed into helper calls");
4662#endif // USE_HELPERS_FOR_INT_DIV
4663#if defined(_TARGET_ARM64_)
4664 assert(node->OperGet() != GT_MOD);
4665#endif // _TARGET_ARM64_
4666
4667 if (!divisor->IsCnsIntOrI())
4668 {
4669 return nullptr; // no transformations to make
4670 }
4671
4672 if (dividend->IsCnsIntOrI())
4673 {
4674 // We shouldn't see a divmod with constant operands here but if we do then it's likely
4675 // because optimizations are disabled or it's a case that's supposed to throw an exception.
4676 // Don't optimize this.
4677 return nullptr;
4678 }
4679
4680 ssize_t divisorValue = divisor->gtIntCon.IconValue();
4681
4682 if (divisorValue == -1 || divisorValue == 0)
4683 {
4684 // x / 0 and x % 0 can't be optimized because they are required to throw an exception.
4685
4686 // x / -1 can't be optimized because INT_MIN / -1 is required to throw an exception.
4687
4688 // x % -1 is always 0 and the IL spec says that the rem instruction "can" throw an exception if x is
4689 // the minimum representable integer. However, the C# spec says that an exception "is" thrown in this
4690 // case so optimizing this case would break C# code.
4691
4692 // A runtime check could be used to handle this case but it's probably too rare to matter.
4693 return nullptr;
4694 }
4695
4696 bool isDiv = divMod->OperGet() == GT_DIV;
4697
4698 if (isDiv)
4699 {
4700 if ((type == TYP_INT && divisorValue == INT_MIN) || (type == TYP_LONG && divisorValue == INT64_MIN))
4701 {
4702 // If the divisor is the minimum representable integer value then we can use a compare,
4703 // the result is 1 iff the dividend equals divisor.
4704 divMod->SetOper(GT_EQ);
4705 return node;
4706 }
4707 }
4708
4709 size_t absDivisorValue =
4710 (divisorValue == SSIZE_T_MIN) ? static_cast<size_t>(divisorValue) : static_cast<size_t>(abs(divisorValue));
4711
4712 if (!isPow2(absDivisorValue))
4713 {
4714 if (comp->opts.MinOpts())
4715 {
4716 return nullptr;
4717 }
4718
4719#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
4720 ssize_t magic;
4721 int shift;
4722
4723 if (type == TYP_INT)
4724 {
4725 magic = MagicDivide::GetSigned32Magic(static_cast<int32_t>(divisorValue), &shift);
4726 }
4727 else
4728 {
4729#ifdef _TARGET_64BIT_
4730 magic = MagicDivide::GetSigned64Magic(static_cast<int64_t>(divisorValue), &shift);
4731#else // !_TARGET_64BIT_
4732 unreached();
4733#endif // !_TARGET_64BIT_
4734 }
4735
4736 divisor->gtIntConCommon.SetIconValue(magic);
4737
4738 // Insert a new GT_MULHI node in front of the existing GT_DIV/GT_MOD node.
4739 // The existing node will later be transformed into a GT_ADD/GT_SUB that
4740 // computes the final result. This way we don't need to find and change the
4741 // use of the existing node.
4742 GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, divisor, dividend);
4743 BlockRange().InsertBefore(divMod, mulhi);
4744
4745 // mulhi was the easy part. Now we need to generate different code depending
4746 // on the divisor value:
4747 // For 3 we need:
4748 // div = signbit(mulhi) + mulhi
4749 // For 5 we need:
4750 // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust
4751 // For 7 we need:
4752 // mulhi += dividend ; requires add adjust
4753 // div = signbit(mulhi) + sar(mulhi, 2) ; requires shift adjust
4754 // For -3 we need:
4755 // mulhi -= dividend ; requires sub adjust
4756 // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust
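 // As a concrete illustration (example values, not taken from the code above): for TYP_INT and
 // divisorValue = 7 the magic-number helper produces magic = 0x92492493 (negative as a signed
 // int) and shift = 2, so both the add adjust and the shift adjust apply. For dividend = 16:
 //   mulhi  = (magic * 16) >> 32        = -7
 //   mulhi += dividend                  ->  9                    ; add adjust
 //   div    = signbit(9) + sar(9, 2)    = 0 + 2 = 2              ; 16 / 7 == 2
 // and for dividend = -16: mulhi = 6, mulhi += dividend -> -10,
 // div = signbit(-10) + sar(-10, 2) = 1 + (-3) = -2, matching truncated division.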
4757 bool requiresAddSubAdjust = signum(divisorValue) != signum(magic);
4758 bool requiresShiftAdjust = shift != 0;
4759 bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv;
4760 unsigned curBBWeight = comp->compCurBB->getBBWeight(comp);
4761
4762 if (requiresDividendMultiuse)
4763 {
4764 LIR::Use dividendUse(BlockRange(), &mulhi->gtOp.gtOp2, mulhi);
4765 dividend = ReplaceWithLclVar(dividendUse);
4766 }
4767
4768 GenTree* adjusted;
4769
4770 if (requiresAddSubAdjust)
4771 {
4772 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
4773 adjusted = comp->gtNewOperNode(divisorValue > 0 ? GT_ADD : GT_SUB, type, mulhi, dividend);
4774 BlockRange().InsertBefore(divMod, dividend, adjusted);
4775 }
4776 else
4777 {
4778 adjusted = mulhi;
4779 }
4780
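 // Extract the sign bit of 'adjusted' as 0 or 1 via a logical right shift by (operand bit size
 // minus 1); adding it to the arithmetically shifted value below corrects negative quotients so
 // that the division truncates toward zero.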
4781 GenTree* shiftBy = comp->gtNewIconNode(genTypeSize(type) * 8 - 1, type);
4782 GenTree* signBit = comp->gtNewOperNode(GT_RSZ, type, adjusted, shiftBy);
4783 BlockRange().InsertBefore(divMod, shiftBy, signBit);
4784
4785 LIR::Use adjustedUse(BlockRange(), &signBit->gtOp.gtOp1, signBit);
4786 adjusted = ReplaceWithLclVar(adjustedUse);
4787 adjusted = comp->gtNewLclvNode(adjusted->AsLclVar()->GetLclNum(), adjusted->TypeGet());
4788 BlockRange().InsertBefore(divMod, adjusted);
4789
4790 if (requiresShiftAdjust)
4791 {
4792 shiftBy = comp->gtNewIconNode(shift, TYP_INT);
4793 adjusted = comp->gtNewOperNode(GT_RSH, type, adjusted, shiftBy);
4794 BlockRange().InsertBefore(divMod, shiftBy, adjusted);
4795 }
4796
4797 if (isDiv)
4798 {
4799 divMod->SetOperRaw(GT_ADD);
4800 divMod->gtOp.gtOp1 = adjusted;
4801 divMod->gtOp.gtOp2 = signBit;
4802 }
4803 else
4804 {
4805 GenTree* div = comp->gtNewOperNode(GT_ADD, type, adjusted, signBit);
4806
4807 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
4808
4809 // dividend % divisor = dividend - divisor x div
4810 GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
4811 GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
4812 BlockRange().InsertBefore(divMod, dividend, div, divisor, mul);
4813
4814 divMod->SetOperRaw(GT_SUB);
4815 divMod->gtOp.gtOp1 = dividend;
4816 divMod->gtOp.gtOp2 = mul;
4817 }
4818
4819 return mulhi;
4820#elif defined(_TARGET_ARM_)
4821 // Currently there's no GT_MULHI for ARM32
4822 return nullptr;
4823#else
4824#error Unsupported or unset target architecture
4825#endif
4826 }
4827
4828 // We're committed to the conversion now. Go find the use if any.
4829 LIR::Use use;
4830 if (!BlockRange().TryGetUse(node, &use))
4831 {
4832 return nullptr;
4833 }
4834
4835 // We need to use the dividend node multiple times so its value needs to be
4836 // computed once and stored in a temp variable.
4837
4838 unsigned curBBWeight = comp->compCurBB->getBBWeight(comp);
4839
4840 LIR::Use opDividend(BlockRange(), &divMod->gtOp.gtOp1, divMod);
4841 dividend = ReplaceWithLclVar(opDividend);
4842
4843 GenTree* adjustment = comp->gtNewOperNode(GT_RSH, type, dividend, comp->gtNewIconNode(type == TYP_INT ? 31 : 63));
4844
4845 if (absDivisorValue == 2)
4846 {
4847 // If the divisor is +/-2 then we'd end up with a bitwise and between 0/-1 and 1.
4848 // We can get the same result by using GT_RSZ instead of GT_RSH.
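 // For example (illustrative values): with divisorValue = 2 and dividend = -7, GT_RSZ yields
 // adjustment = 1, adjustedDividend = -6 and -6 >> 1 = -3, i.e. -7 / 2 truncated toward zero;
 // the GT_AND path below would compute the same 0/1 adjustment, just less directly.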
4849 adjustment->SetOper(GT_RSZ);
4850 }
4851 else
4852 {
4853 adjustment = comp->gtNewOperNode(GT_AND, type, adjustment, comp->gtNewIconNode(absDivisorValue - 1, type));
4854 }
4855
4856 GenTree* adjustedDividend =
4857 comp->gtNewOperNode(GT_ADD, type, adjustment,
4858 comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()));
4859
4860 GenTree* newDivMod;
4861
4862 if (isDiv)
4863 {
4864 // perform the division by right shifting the adjusted dividend
4865 divisor->gtIntCon.SetIconValue(genLog2(absDivisorValue));
4866
4867 newDivMod = comp->gtNewOperNode(GT_RSH, type, adjustedDividend, divisor);
4868 ContainCheckShiftRotate(newDivMod->AsOp());
4869
4870 if (divisorValue < 0)
4871 {
4872 // negate the result if the divisor is negative
4873 newDivMod = comp->gtNewOperNode(GT_NEG, type, newDivMod);
4874 ContainCheckNode(newDivMod);
4875 }
4876 }
4877 else
4878 {
4879 // dividend % divisor = dividend - divisor x (dividend / divisor)
4880 // divisor x (dividend / divisor) translates to (dividend >> log2(divisor)) << log2(divisor)
4881 // which simply discards the low log2(divisor) bits; that's just dividend & ~(divisor - 1)
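 // For example (illustrative values): with dividend = -13 and absDivisorValue = 8,
 //   adjustment       = (-13 >> 31) & 7  = 7
 //   adjustedDividend = -13 + 7          = -6
 //   result           = -13 - (-6 & ~7)  = -13 - (-8) = -5      ; -13 % 8 == -5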
4882 divisor->gtIntCon.SetIconValue(~(absDivisorValue - 1));
4883
4884 newDivMod = comp->gtNewOperNode(GT_SUB, type,
4885 comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()),
4886 comp->gtNewOperNode(GT_AND, type, adjustedDividend, divisor));
4887 }
4888
4889 // Remove the divisor and dividend nodes from the linear order,
4890 // since we have reused them and will resequence the tree
4891 BlockRange().Remove(divisor);
4892 BlockRange().Remove(dividend);
4893
4894 // linearize and insert the new tree before the original divMod node
4895 InsertTreeBeforeAndContainCheck(divMod, newDivMod);
4896 BlockRange().Remove(divMod);
4897
4898 // replace the original divmod node with the new divmod tree
4899 use.ReplaceWith(comp, newDivMod);
4900
4901 return newDivMod->gtNext;
4902}
4903//------------------------------------------------------------------------
4904// LowerSignedDivOrMod: lower a signed GT_DIV/GT_MOD node, transforming it into an
4905// equivalent but faster sequence when the divisor is a suitable constant.
4906//
4907// Arguments:
4908// node - the DIV or MOD node
4909//
4910// Returns:
4911// The next node to lower.
4912//
4913GenTree* Lowering::LowerSignedDivOrMod(GenTree* node)
4914{
4915 assert((node->OperGet() == GT_DIV) || (node->OperGet() == GT_MOD));
4916 GenTree* next = node->gtNext;
4917 GenTree* divMod = node;
4918 GenTree* dividend = divMod->gtGetOp1();
4919 GenTree* divisor = divMod->gtGetOp2();
4920
4921 if (varTypeIsIntegral(node->TypeGet()))
4922 {
4923 // LowerConstIntDivOrMod will return nullptr if it doesn't transform the node.
4924 GenTree* newNode = LowerConstIntDivOrMod(node);
4925 if (newNode != nullptr)
4926 {
4927 return newNode;
4928 }
4929 }
4930 ContainCheckDivOrMod(node->AsOp());
4931
4932 return next;
4933}
4934
4935//------------------------------------------------------------------------
4936// LowerShift: Lower shift nodes
4937//
4938// Arguments:
4939// shift - the shift node (GT_LSH, GT_RSH or GT_RSZ)
4940//
4941// Notes:
4942// Remove unnecessary shift count masking; xarch shift instructions mask the shift
4943// count to 5 bits (or 6 bits for 64-bit operations).
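// For example, C# code such as "x << (count & 31)" reaches lowering as
// GT_LSH(x, GT_AND(count, 31)); because the hardware already masks the count to 5 bits for
// 32-bit operands, the GT_AND (and any nested ANDs whose masks cover those bits) is removed
// and the count operand is used directly.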
4944
4945void Lowering::LowerShift(GenTreeOp* shift)
4946{
4947 assert(shift->OperIs(GT_LSH, GT_RSH, GT_RSZ));
4948
4949 size_t mask = 0x1f;
4950#ifdef _TARGET_64BIT_
4951 if (varTypeIsLong(shift->TypeGet()))
4952 {
4953 mask = 0x3f;
4954 }
4955#else
4956 assert(!varTypeIsLong(shift->TypeGet()));
4957#endif
4958
4959 for (GenTree* andOp = shift->gtGetOp2(); andOp->OperIs(GT_AND); andOp = andOp->gtGetOp1())
4960 {
4961 GenTree* maskOp = andOp->gtGetOp2();
4962
4963 if (!maskOp->IsCnsIntOrI())
4964 {
4965 break;
4966 }
4967
4968 if ((static_cast<size_t>(maskOp->AsIntCon()->IconValue()) & mask) != mask)
4969 {
4970 break;
4971 }
4972
4973 shift->gtOp2 = andOp->gtGetOp1();
4974 BlockRange().Remove(andOp);
4975 BlockRange().Remove(maskOp);
4976 // The operand's parent was replaced, so clear its contained and reg-optional state.
4977 shift->gtOp2->ClearContained();
4978 }
4979 ContainCheckShiftRotate(shift);
4980}
4981
4982void Lowering::WidenSIMD12IfNecessary(GenTreeLclVarCommon* node)
4983{
4984#ifdef FEATURE_SIMD
4985 if (node->TypeGet() == TYP_SIMD12)
4986 {
4987 // Assumption 1:
4988 // RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off
4989 // to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
4990 // reading and writing purposes.
4991 //
4992 // Assumption 2:
4993 // RyuJit backend makes another implicit assumption: when Vector3 type args are passed in
4994 // registers or on the stack, the uppermost 4 bytes will be zero.
4995 //
4996 // For P/Invoke return and Reverse P/Invoke argument passing, the native compiler doesn't
4997 // guarantee that the upper 4 bytes of a Vector3 type struct are zero initialized, and hence
4998 // assumption 2 is invalid.
4999 //
5000 // RyuJIT x64 Windows: arguments are treated as passed by ref and hence only 12 bytes are
5001 // read/written. In the case of Vector3 returns, the caller allocates a zero-initialized
5002 // Vector3 local and passes it as the retBuf arg, and the callee writes only 12 bytes to
5003 // retBuf. For this reason, there is no need to clear the upper 4 bytes of Vector3 type args.
5004 //
5005 // RyuJIT x64 Unix: arguments are treated as passed by value and read/written as if TYP_SIMD16.
5006 // Vector3 return values are returned in two return registers and the caller assembles them
5007 // into a single xmm reg. Hence RyuJIT explicitly generates code to clear the upper 4 bytes of
5008 // Vector3 type args in the prolog and of the Vector3 type return value of a call.
5009 //
5010 // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments
5011 // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed
5012 // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear
5013 // it either.
5014
5015 unsigned varNum = node->AsLclVarCommon()->GetLclNum();
5016 LclVarDsc* varDsc = &comp->lvaTable[varNum];
5017
5018 if (comp->lvaMapSimd12ToSimd16(varDsc))
5019 {
5020 JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
5021 DISPNODE(node);
5022 JITDUMP("============");
5023
5024 node->gtType = TYP_SIMD16;
5025 }
5026 }
5027#endif // FEATURE_SIMD
5028}
5029
5030//------------------------------------------------------------------------
5031// LowerArrElem: Lower a GT_ARR_ELEM node
5032//
5033// Arguments:
5034// node - the GT_ARR_ELEM node to lower.
5035//
5036// Return Value:
5037// The next node to lower.
5038//
5039// Assumptions:
5040// 'node' is a GT_ARR_ELEM node.
5041//
5042// Notes:
5043// This performs the following lowering. We start with a node of the form:
5044// /--* <arrObj>
5045// +--* <index0>
5046// +--* <index1>
5047// /--* arrMD&[,]
5048//
5049// First, we create temps for arrObj if it is not already a lclVar, and for any of the index
5050// expressions that have side-effects.
5051// We then transform the tree into:
5052// <offset is null - no accumulated offset for the first index>
5053// /--* <arrObj>
5054// +--* <index0>
5055// /--* ArrIndex[i, ]
5056// +--* <arrObj>
5057// /--| arrOffs[i, ]
5058// | +--* <arrObj>
5059// | +--* <index1>
5060// +--* ArrIndex[*,j]
5061// +--* <arrObj>
5062// /--| arrOffs[*,j]
5063// +--* lclVar NewTemp
5064// /--* lea (scale = element size, offset = offset of first element)
5065//
5066// The new stmtExpr may be omitted if the <arrObj> is a lclVar.
5067// The new stmtExpr may be embedded if the <arrObj> is not the first tree in linear order for
5068// the statement containing the original arrMD.
5069// Note that the arrMDOffs is the INDEX of the lea, but is evaluated before the BASE (which is the second
5070// reference to NewTemp), because that provides more accurate lifetimes.
5071// There may be 1, 2 or 3 dimensions, with 1, 2 or 3 arrMDIdx nodes, respectively.
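// For example, a rank-2 access such as "a[i, j]" produces two ArrIndex/ArrOffset pairs (one per
// dimension) feeding a single GT_LEA whose scale is the element size and whose offset is the
// offset of the first element within the array object.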
5072//
5073GenTree* Lowering::LowerArrElem(GenTree* node)
5074{
5075 // This will assert if we don't have an ArrElem node
5076 GenTreeArrElem* arrElem = node->AsArrElem();
5077 const unsigned char rank = arrElem->gtArrElem.gtArrRank;
5078 const unsigned blockWeight = m_block->getBBWeight(comp);
5079
5080 JITDUMP("Lowering ArrElem\n");
5081 JITDUMP("============\n");
5082 DISPTREERANGE(BlockRange(), arrElem);
5083 JITDUMP("\n");
5084
5085 assert(arrElem->gtArrObj->TypeGet() == TYP_REF);
5086
5087 // We need to have the array object in a lclVar.
5088 if (!arrElem->gtArrObj->IsLocal())
5089 {
5090 LIR::Use arrObjUse(BlockRange(), &arrElem->gtArrObj, arrElem);
5091 ReplaceWithLclVar(arrObjUse);
5092 }
5093
5094 GenTree* arrObjNode = arrElem->gtArrObj;
5095 assert(arrObjNode->IsLocal());
5096
5097 LclVarDsc* const varDsc = &comp->lvaTable[arrElem->gtArrObj->AsLclVarCommon()->gtLclNum];
5098
5099 GenTree* insertionPoint = arrElem;
5100
5101 // The first ArrOffs node will have 0 for the offset of the previous dimension.
5102 GenTree* prevArrOffs = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_I_IMPL, 0);
5103 BlockRange().InsertBefore(insertionPoint, prevArrOffs);
5104 GenTree* nextToLower = prevArrOffs;
5105
5106 for (unsigned char dim = 0; dim < rank; dim++)
5107 {
5108 GenTree* indexNode = arrElem->gtArrElem.gtArrInds[dim];
5109
5110 // Use the original arrObjNode on the 0th ArrIndex node, and clone it for subsequent ones.
5111 GenTree* idxArrObjNode;
5112 if (dim == 0)
5113 {
5114 idxArrObjNode = arrObjNode;
5115 }
5116 else
5117 {
5118 idxArrObjNode = comp->gtClone(arrObjNode);
5119 BlockRange().InsertBefore(insertionPoint, idxArrObjNode);
5120 }
5121
5122 // Next comes the GT_ARR_INDEX node.
5123 GenTreeArrIndex* arrMDIdx = new (comp, GT_ARR_INDEX)
5124 GenTreeArrIndex(TYP_INT, idxArrObjNode, indexNode, dim, rank, arrElem->gtArrElem.gtArrElemType);
5125 arrMDIdx->gtFlags |= ((idxArrObjNode->gtFlags | indexNode->gtFlags) & GTF_ALL_EFFECT);
5126 BlockRange().InsertBefore(insertionPoint, arrMDIdx);
5127
5128 GenTree* offsArrObjNode = comp->gtClone(arrObjNode);
5129 BlockRange().InsertBefore(insertionPoint, offsArrObjNode);
5130
5131 GenTreeArrOffs* arrOffs =
5132 new (comp, GT_ARR_OFFSET) GenTreeArrOffs(TYP_I_IMPL, prevArrOffs, arrMDIdx, offsArrObjNode, dim, rank,
5133 arrElem->gtArrElem.gtArrElemType);
5134 arrOffs->gtFlags |= ((prevArrOffs->gtFlags | arrMDIdx->gtFlags | offsArrObjNode->gtFlags) & GTF_ALL_EFFECT);
5135 BlockRange().InsertBefore(insertionPoint, arrOffs);
5136
5137 prevArrOffs = arrOffs;
5138 }
5139
5140 // Generate the LEA; the index (the accumulated offset) is sequenced before the base,
5141 // because we want to evaluate the index expression before the base.
5142 unsigned scale = arrElem->gtArrElem.gtArrElemSize;
5143 unsigned offset = comp->eeGetMDArrayDataOffset(arrElem->gtArrElem.gtArrElemType, arrElem->gtArrElem.gtArrRank);
5144
5145 GenTree* leaIndexNode = prevArrOffs;
5146 if (!jitIsScaleIndexMul(scale))
5147 {
5148 // We do the address arithmetic in TYP_I_IMPL, though note that the lower bounds and lengths in memory are
5149 // TYP_INT
5150 GenTree* scaleNode = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_I_IMPL, scale);
5151 GenTree* mulNode = new (comp, GT_MUL) GenTreeOp(GT_MUL, TYP_I_IMPL, leaIndexNode, scaleNode);
5152 BlockRange().InsertBefore(insertionPoint, scaleNode, mulNode);
5153 leaIndexNode = mulNode;
5154 scale = 1;
5155 }
5156
5157 GenTree* leaBase = comp->gtClone(arrObjNode);
5158 BlockRange().InsertBefore(insertionPoint, leaBase);
5159
5160 GenTree* leaNode = new (comp, GT_LEA) GenTreeAddrMode(arrElem->TypeGet(), leaBase, leaIndexNode, scale, offset);
5161
5162 BlockRange().InsertBefore(insertionPoint, leaNode);
5163
5164 LIR::Use arrElemUse;
5165 if (BlockRange().TryGetUse(arrElem, &arrElemUse))
5166 {
5167 arrElemUse.ReplaceWith(comp, leaNode);
5168 }
5169 else
5170 {
5171 leaNode->SetUnusedValue();
5172 }
5173
5174 BlockRange().Remove(arrElem);
5175
5176 JITDUMP("Results of lowering ArrElem:\n");
5177 DISPTREERANGE(BlockRange(), leaNode);
5178 JITDUMP("\n\n");
5179
5180 return nextToLower;
5181}
5182
5183void Lowering::DoPhase()
5184{
5185 // If we have any PInvoke calls, insert the one-time prolog code. We'll insert the epilog code in the
5186 // appropriate spots later. NOTE: there is a minor optimization opportunity here, as we still create p/invoke
5187 // data structures and setup/teardown even if we've eliminated all p/invoke calls due to dead code elimination.
5188 if (comp->info.compCallUnmanaged)
5189 {
5190 InsertPInvokeMethodProlog();
5191 }
5192
5193#if !defined(_TARGET_64BIT_)
5194 DecomposeLongs decomp(comp); // Initialize the long decomposition class.
5195 if (comp->compLongUsed)
5196 {
5197 decomp.PrepareForDecomposition();
5198 }
5199#endif // !defined(_TARGET_64BIT_)
5200
5201 for (BasicBlock* block = comp->fgFirstBB; block; block = block->bbNext)
5202 {
5203 /* Make the block publicly available */
5204 comp->compCurBB = block;
5205
5206#if !defined(_TARGET_64BIT_)
5207 if (comp->compLongUsed)
5208 {
5209 decomp.DecomposeBlock(block);
5210 }
5211#endif //!_TARGET_64BIT_
5212
5213 LowerBlock(block);
5214 }
5215
5216#ifdef DEBUG
5217 JITDUMP("Lower has completed modifying nodes.\n");
5218 if (VERBOSE)
5219 {
5220 comp->fgDispBasicBlocks(true);
5221 }
5222#endif
5223
5224 // Recompute local var ref counts before potentially sorting for liveness.
5225 // Note this does minimal work in cases where we are not going to sort.
5226 const bool isRecompute = true;
5227 const bool setSlotNumbers = false;
5228 comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);
5229
5230 comp->fgLocalVarLiveness();
5231 // local var liveness can delete code, which may create empty blocks
5232 if (comp->opts.OptimizationEnabled())
5233 {
5234 comp->optLoopsMarked = false;
5235 bool modified = comp->fgUpdateFlowGraph();
5236 if (modified)
5237 {
5238 JITDUMP("had to run another liveness pass:\n");
5239 comp->fgLocalVarLiveness();
5240 }
5241 }
5242
5243 // Recompute local var ref counts again after liveness to reflect
5244 // impact of any dead code removal. Note this may leave us with
5245 // tracked vars that have zero refs.
5246 comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);
5247
5248#ifdef DEBUG
5249 JITDUMP("Liveness pass finished after lowering, IR:\n");
5250 if (VERBOSE)
5251 {
5252 comp->fgDispBasicBlocks(true);
5253 }
5254
5255 for (BasicBlock* block = comp->fgFirstBB; block; block = block->bbNext)
5256 {
5257 assert(LIR::AsRange(block).CheckLIR(comp, true));
5258 }
5259#endif
5260}
5261
5262#ifdef DEBUG
5263
5264//------------------------------------------------------------------------
5265// Lowering::CheckCallArg: check that a call argument is in an expected
5266// form after lowering.
5267//
5268// Arguments:
5269// arg - the argument to check.
5270//
5271void Lowering::CheckCallArg(GenTree* arg)
5272{
5273 if (!arg->IsValue() && !arg->OperIsPutArgStk())
5274 {
5275 assert((arg->OperIsStore() && !arg->IsValue()) || arg->IsArgPlaceHolderNode() || arg->IsNothingNode() ||
5276 arg->OperIsCopyBlkOp());
5277 return;
5278 }
5279
5280 switch (arg->OperGet())
5281 {
5282 case GT_FIELD_LIST:
5283 {
5284 GenTreeFieldList* list = arg->AsFieldList();
5285 assert(list->isContained());
5286 assert(list->IsFieldListHead());
5287
5288 for (; list != nullptr; list = list->Rest())
5289 {
5290 assert(list->Current()->OperIsPutArg());
5291 }
5292 }
5293 break;
5294
5295 default:
5296 assert(arg->OperIsPutArg());
5297 break;
5298 }
5299}
5300
5301//------------------------------------------------------------------------
5302// Lowering::CheckCall: check that a call is in an expected form after
5303// lowering. Currently this amounts to checking its
5304// arguments, but could be expanded to verify more
5305// properties in the future.
5306//
5307// Arguments:
5308// call - the call to check.
5309//
5310void Lowering::CheckCall(GenTreeCall* call)
5311{
5312 if (call->gtCallObjp != nullptr)
5313 {
5314 CheckCallArg(call->gtCallObjp);
5315 }
5316
5317 for (GenTreeArgList* args = call->gtCallArgs; args != nullptr; args = args->Rest())
5318 {
5319 CheckCallArg(args->Current());
5320 }
5321
5322 for (GenTreeArgList* args = call->gtCallLateArgs; args != nullptr; args = args->Rest())
5323 {
5324 CheckCallArg(args->Current());
5325 }
5326}
5327
5328//------------------------------------------------------------------------
5329// Lowering::CheckNode: check that an LIR node is in an expected form
5330// after lowering.
5331//
5332// Arguments:
5333// compiler - the compiler context.
5334// node - the node to check.
5335//
5336void Lowering::CheckNode(Compiler* compiler, GenTree* node)
5337{
5338 switch (node->OperGet())
5339 {
5340 case GT_CALL:
5341 CheckCall(node->AsCall());
5342 break;
5343
5344#ifdef FEATURE_SIMD
5345 case GT_SIMD:
5346 assert(node->TypeGet() != TYP_SIMD12);
5347 break;
5348#ifdef _TARGET_64BIT_
5349 case GT_LCL_VAR:
5350 case GT_STORE_LCL_VAR:
5351 {
5352 unsigned lclNum = node->AsLclVarCommon()->GetLclNum();
5353 LclVarDsc* lclVar = &compiler->lvaTable[lclNum];
5354 assert(node->TypeGet() != TYP_SIMD12 || compiler->lvaIsFieldOfDependentlyPromotedStruct(lclVar));
5355 }
5356 break;
5357#endif // _TARGET_64BIT_
5358#endif // FEATURE_SIMD
5359
5360 default:
5361 break;
5362 }
5363}
5364
5365//------------------------------------------------------------------------
5366// Lowering::CheckBlock: check that the contents of an LIR block are in an
5367// expected form after lowering.
5368//
5369// Arguments:
5370// compiler - the compiler context.
5371// block - the block to check.
5372//
5373bool Lowering::CheckBlock(Compiler* compiler, BasicBlock* block)
5374{
5375 assert(block->isEmpty() || block->IsLIR());
5376
5377 LIR::Range& blockRange = LIR::AsRange(block);
5378 for (GenTree* node : blockRange)
5379 {
5380 CheckNode(compiler, node);
5381 }
5382
5383 assert(blockRange.CheckLIR(compiler, true));
5384 return true;
5385}
5386#endif
5387
5388void Lowering::LowerBlock(BasicBlock* block)
5389{
5390 assert(block == comp->compCurBB); // compCurBB must already be set.
5391 assert(block->isEmpty() || block->IsLIR());
5392
5393 m_block = block;
5394
5395 // NOTE: some of the lowering methods insert calls before the node being
5396 // lowered (See e.g. InsertPInvoke{Method,Call}{Prolog,Epilog}). In
5397 // general, any code that is inserted before the current node should be
5398 // "pre-lowered" as it won't be subject to further processing.
5399 // Lowering::CheckBlock() runs some extra checks on call arguments in
5400 // order to help catch unlowered nodes.
5401
5402 GenTree* node = BlockRange().FirstNode();
5403 while (node != nullptr)
5404 {
5405 node = LowerNode(node);
5406 }
5407
5408 assert(CheckBlock(comp, block));
5409}
5410
5411/** Verifies if both of these trees represent the same indirection.
5412 * Used by Lower to annotate whether CodeGen can generate an instruction of the
5413 * form *addrMode BinOp= expr
5414 *
5415 * Preconditions: both trees are children of GT_INDs and their underlying children
5416 * have the same gtOper.
5417 *
5418 * This is a first iteration to actually recognize trees that can be code-generated
5419 * as a single read-modify-write instruction on AMD64/x86. For now
5420 * this method only supports the recognition of simple addressing modes (through GT_LEA)
5421 * or local var indirections. Local fields, array access and other more complex nodes are
5422 * not yet supported.
5423 *
5424 * TODO-CQ: Perform tree recognition by using the Value Numbering Package, so that we can recognize
5425 * arbitrarily complex trees and support many more addressing patterns.
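 *
 * For example (illustrative shape), a tree such as
 *     STOREIND(LEA(base, index), ADD(IND(LEA(base, index)), value))
 * can be emitted as a single "add [base+index], value" once the two address trees are
 * recognized as equivalent here.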
5426 */
5427bool Lowering::IndirsAreEquivalent(GenTree* candidate, GenTree* storeInd)
5428{
5429 assert(candidate->OperGet() == GT_IND);
5430 assert(storeInd->OperGet() == GT_STOREIND);
5431
5432 // We should check the size of the indirections. If they are
5433 // different, say because of a cast, then we can't call them equivalent. Doing so could cause us
5434 // to drop a cast.
5435 // Signed-ness difference is okay and expected since a store indirection must always
5436 // be signed based on the CIL spec, but a load could be unsigned.
5437 if (genTypeSize(candidate->gtType) != genTypeSize(storeInd->gtType))
5438 {
5439 return false;
5440 }
5441
5442 GenTree* pTreeA = candidate->gtGetOp1();
5443 GenTree* pTreeB = storeInd->gtGetOp1();
5444
5445 // This method will be called by codegen (as well as during lowering).
5446 // After register allocation, the sources may have been spilled and reloaded
5447 // to a different register, indicated by an inserted GT_RELOAD node.
5448 pTreeA = pTreeA->gtSkipReloadOrCopy();
5449 pTreeB = pTreeB->gtSkipReloadOrCopy();
5450
5451 genTreeOps oper;
5452
5453 if (pTreeA->OperGet() != pTreeB->OperGet())
5454 {
5455 return false;
5456 }
5457
5458 oper = pTreeA->OperGet();
5459 switch (oper)
5460 {
5461 case GT_LCL_VAR:
5462 case GT_LCL_VAR_ADDR:
5463 case GT_CLS_VAR_ADDR:
5464 case GT_CNS_INT:
5465 return NodesAreEquivalentLeaves(pTreeA, pTreeB);
5466
5467 case GT_LEA:
5468 {
5469 GenTreeAddrMode* gtAddr1 = pTreeA->AsAddrMode();
5470 GenTreeAddrMode* gtAddr2 = pTreeB->AsAddrMode();
5471 return NodesAreEquivalentLeaves(gtAddr1->Base(), gtAddr2->Base()) &&
5472 NodesAreEquivalentLeaves(gtAddr1->Index(), gtAddr2->Index()) &&
5473 (gtAddr1->gtScale == gtAddr2->gtScale) && (gtAddr1->Offset() == gtAddr2->Offset());
5474 }
5475 default:
5476 // We don't handle anything that is not either a constant,
5477 // a local var or LEA.
5478 return false;
5479 }
5480}
5481
5482/** Test whether the two given nodes are the same leaves.
5483 * Right now, only integer constants, local variables/addresses and class variable addresses are supported.
5484 */
5485bool Lowering::NodesAreEquivalentLeaves(GenTree* tree1, GenTree* tree2)
5486{
5487 if (tree1 == nullptr && tree2 == nullptr)
5488 {
5489 return true;
5490 }
5491
5492 // Both-null was handled above; if only one of them is null, they are not equivalent.
5493 if (tree1 == nullptr || tree2 == nullptr)
5494 {
5495 return false;
5496 }
5497
5498 tree1 = tree1->gtSkipReloadOrCopy();
5499 tree2 = tree2->gtSkipReloadOrCopy();
5500
5501 if (tree1->TypeGet() != tree2->TypeGet())
5502 {
5503 return false;
5504 }
5505
5506 if (tree1->OperGet() != tree2->OperGet())
5507 {
5508 return false;
5509 }
5510
5511 if (!tree1->OperIsLeaf() || !tree2->OperIsLeaf())
5512 {
5513 return false;
5514 }
5515
5516 switch (tree1->OperGet())
5517 {
5518 case GT_CNS_INT:
5519 return tree1->gtIntCon.gtIconVal == tree2->gtIntCon.gtIconVal &&
5520 tree1->IsIconHandle() == tree2->IsIconHandle();
5521 case GT_LCL_VAR:
5522 case GT_LCL_VAR_ADDR:
5523 return tree1->gtLclVarCommon.gtLclNum == tree2->gtLclVarCommon.gtLclNum;
5524 case GT_CLS_VAR_ADDR:
5525 return tree1->gtClsVar.gtClsVarHnd == tree2->gtClsVar.gtClsVarHnd;
5526 default:
5527 return false;
5528 }
5529}
5530
5531//------------------------------------------------------------------------
5532// Containment Analysis
5533//------------------------------------------------------------------------
5534void Lowering::ContainCheckNode(GenTree* node)
5535{
5536 switch (node->gtOper)
5537 {
5538 case GT_STORE_LCL_VAR:
5539 case GT_STORE_LCL_FLD:
5540 ContainCheckStoreLoc(node->AsLclVarCommon());
5541 break;
5542
5543 case GT_EQ:
5544 case GT_NE:
5545 case GT_LT:
5546 case GT_LE:
5547 case GT_GE:
5548 case GT_GT:
5549 case GT_TEST_EQ:
5550 case GT_TEST_NE:
5551 case GT_CMP:
5552 case GT_JCMP:
5553 ContainCheckCompare(node->AsOp());
5554 break;
5555
5556 case GT_JTRUE:
5557 ContainCheckJTrue(node->AsOp());
5558 break;
5559
5560 case GT_ADD:
5561 case GT_SUB:
5562#if !defined(_TARGET_64BIT_)
5563 case GT_ADD_LO:
5564 case GT_ADD_HI:
5565 case GT_SUB_LO:
5566 case GT_SUB_HI:
5567#endif
5568 case GT_AND:
5569 case GT_OR:
5570 case GT_XOR:
5571 ContainCheckBinary(node->AsOp());
5572 break;
5573
5574#if defined(_TARGET_X86_)
5575 case GT_MUL_LONG:
5576#endif
5577 case GT_MUL:
5578 case GT_MULHI:
5579 ContainCheckMul(node->AsOp());
5580 break;
5581 case GT_DIV:
5582 case GT_MOD:
5583 case GT_UDIV:
5584 case GT_UMOD:
5585 ContainCheckDivOrMod(node->AsOp());
5586 break;
5587 case GT_LSH:
5588 case GT_RSH:
5589 case GT_RSZ:
5590 case GT_ROL:
5591 case GT_ROR:
5592#ifndef _TARGET_64BIT_
5593 case GT_LSH_HI:
5594 case GT_RSH_LO:
5595#endif
5596 ContainCheckShiftRotate(node->AsOp());
5597 break;
5598 case GT_ARR_OFFSET:
5599 ContainCheckArrOffset(node->AsArrOffs());
5600 break;
5601 case GT_LCLHEAP:
5602 ContainCheckLclHeap(node->AsOp());
5603 break;
5604 case GT_RETURN:
5605 ContainCheckRet(node->AsOp());
5606 break;
5607 case GT_RETURNTRAP:
5608 ContainCheckReturnTrap(node->AsOp());
5609 break;
5610 case GT_STOREIND:
5611 ContainCheckStoreIndir(node->AsIndir());
 break;
5612 case GT_IND:
5613 ContainCheckIndir(node->AsIndir());
5614 break;
5615 case GT_PUTARG_REG:
5616 case GT_PUTARG_STK:
5617#if FEATURE_ARG_SPLIT
5618 case GT_PUTARG_SPLIT:
5619#endif // FEATURE_ARG_SPLIT
5620 // The regNum must have been set by the lowering of the call.
5621 assert(node->gtRegNum != REG_NA);
5622 break;
5623#ifdef _TARGET_XARCH_
5624 case GT_INTRINSIC:
5625 ContainCheckIntrinsic(node->AsOp());
5626 break;
5627#endif // _TARGET_XARCH_
5628#ifdef FEATURE_SIMD
5629 case GT_SIMD:
5630 ContainCheckSIMD(node->AsSIMD());
5631 break;
5632#endif // FEATURE_SIMD
5633#ifdef FEATURE_HW_INTRINSICS
5634 case GT_HWIntrinsic:
5635 ContainCheckHWIntrinsic(node->AsHWIntrinsic());
5636 break;
5637#endif // FEATURE_HW_INTRINSICS
5638 default:
5639 break;
5640 }
5641}
5642
5643//------------------------------------------------------------------------
5644// ContainCheckReturnTrap: determine whether the source of a RETURNTRAP should be contained.
5645//
5646// Arguments:
5647// node - pointer to the GT_RETURNTRAP node
5648//
5649void Lowering::ContainCheckReturnTrap(GenTreeOp* node)
5650{
5651#ifdef _TARGET_XARCH_
5652 assert(node->OperIs(GT_RETURNTRAP));
5653 // This just turns into a compare of its child with an int + a conditional call
5654 if (node->gtOp1->isIndir())
5655 {
5656 MakeSrcContained(node, node->gtOp1);
5657 }
5658#endif // _TARGET_XARCH_
5659}
5660
5661//------------------------------------------------------------------------
5662// ContainCheckArrOffset: determine whether the source of an ARR_OFFSET should be contained.
5663//
5664// Arguments:
5665// node - pointer to the GT_ARR_OFFSET node
5666//
5667void Lowering::ContainCheckArrOffset(GenTreeArrOffs* node)
5668{
5669 assert(node->OperIs(GT_ARR_OFFSET));
5670 // we don't want to generate code for this
5671 if (node->gtOffset->IsIntegralConst(0))
5672 {
5673 MakeSrcContained(node, node->gtArrOffs.gtOffset);
5674 }
5675}
5676
5677//------------------------------------------------------------------------
5678// ContainCheckLclHeap: determine whether the source of a GT_LCLHEAP node should be contained.
5679//
5680// Arguments:
5681// node - pointer to the node
5682//
5683void Lowering::ContainCheckLclHeap(GenTreeOp* node)
5684{
5685 assert(node->OperIs(GT_LCLHEAP));
5686 GenTree* size = node->gtOp.gtOp1;
5687 if (size->IsCnsIntOrI())
5688 {
5689 MakeSrcContained(node, size);
5690 }
5691}
5692
5693//------------------------------------------------------------------------
5694// ContainCheckRet: determine whether the source of a GT_RETURN node should be contained.
5695//
5696// Arguments:
5697// ret - pointer to the GT_RETURN node
5698//
5699void Lowering::ContainCheckRet(GenTreeOp* ret)
5700{
5701 assert(ret->OperIs(GT_RETURN));
5702
5703#if !defined(_TARGET_64BIT_)
5704 if (ret->TypeGet() == TYP_LONG)
5705 {
5706 GenTree* op1 = ret->gtGetOp1();
5707 noway_assert(op1->OperGet() == GT_LONG);
5708 MakeSrcContained(ret, op1);
5709 }
5710#endif // !defined(_TARGET_64BIT_)
5711#if FEATURE_MULTIREG_RET
5712 if (varTypeIsStruct(ret))
5713 {
5714 GenTree* op1 = ret->gtGetOp1();
5715 // op1 must be either a lclvar or a multi-reg returning call
5716 if (op1->OperGet() == GT_LCL_VAR)
5717 {
5718 GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
5719 LclVarDsc* varDsc = &(comp->lvaTable[lclVarCommon->gtLclNum]);
5720 assert(varDsc->lvIsMultiRegRet);
5721
5722 // Mark var as contained if not enregistrable.
5723 if (!varTypeIsEnregisterableStruct(op1))
5724 {
5725 MakeSrcContained(ret, op1);
5726 }
5727 }
5728 }
5729#endif // FEATURE_MULTIREG_RET
5730}
5731
5732//------------------------------------------------------------------------
5733// ContainCheckJTrue: determine whether the source of a JTRUE should be contained.
5734//
5735// Arguments:
5736// node - pointer to the node
5737//
5738void Lowering::ContainCheckJTrue(GenTreeOp* node)
5739{
5740 // The compare does not need to be generated into a register.
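 // Retyping it to TYP_VOID and setting GTF_SET_FLAGS below lets the GT_JTRUE consume the
 // processor flags produced by the compare, so codegen can emit a conditional branch on those
 // flags directly instead of materializing a 0/1 value and testing it.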
5741 GenTree* cmp = node->gtGetOp1();
5742 cmp->gtType = TYP_VOID;
5743 cmp->gtFlags |= GTF_SET_FLAGS;
5744}
5745