1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                      Register Requirements for XARCH                     XX
XX                                                                           XX
XX  This encapsulates all the logic for setting register requirements for   XX
XX  the XARCH architectures (x86 and x64).                                   XX
XX                                                                           XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
17
18#include "jitpch.h"
19#ifdef _MSC_VER
20#pragma hdrstop
21#endif
22
23#ifdef _TARGET_XARCH_
24
25#include "jit.h"
26#include "sideeffects.h"
27#include "lower.h"
28
//------------------------------------------------------------------------
// BuildNode: Build the RefPositions for a node
//
// Arguments:
//    treeNode - the node of interest
//
// Return Value:
//    The number of sources consumed by this node.
//
// Notes:
//    Preconditions:
//       LSRA has been initialized.
//
//    Postconditions:
//       RefPositions have been built for all the register defs and uses required
//       for this node.
//
46int LinearScan::BuildNode(GenTree* tree)
47{
48 assert(!tree->isContained());
49 Interval* prefSrcInterval = nullptr;
50 int srcCount;
51 int dstCount = 0;
52 regMaskTP dstCandidates = RBM_NONE;
53 regMaskTP killMask = RBM_NONE;
54 bool isLocalDefUse = false;
55
56 // Reset the build-related members of LinearScan.
57 clearBuildState();
58
59 // Set the default dstCount. This may be modified below.
60 if (tree->IsValue())
61 {
62 dstCount = 1;
63 if (tree->IsUnusedValue())
64 {
65 isLocalDefUse = true;
66 }
67 }
68 else
69 {
70 dstCount = 0;
71 }
72
    // A floating-point node generates AVX instructions (vmovss, etc.), so set the flag.
74 SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
75
76 switch (tree->OperGet())
77 {
78 default:
79 srcCount = BuildSimple(tree);
80 break;
81
82 case GT_LCL_VAR:
            // Because we do containment analysis before we redo dataflow and identify register
            // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
            // candidates.
            // If there is a lclVar that is estimated to be a register candidate but
            // is not, and it was marked regOptional, it should now be marked contained instead.
            // TODO-XArch-CQ: When this is being called while RefPositions are being created,
            // use lvLRACandidate here instead.
90 if (tree->IsRegOptional())
91 {
92 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
93 compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
94 {
95 tree->ClearRegOptional();
96 tree->SetContained();
97 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 0));
98 return 0;
99 }
100 }
101 __fallthrough;
102
103 case GT_LCL_FLD:
104 {
105 // We handle tracked variables differently from non-tracked ones. If it is tracked,
106 // we will simply add a use of the tracked variable at its parent/consumer.
107 // Otherwise, for a use we need to actually add the appropriate references for loading
108 // or storing the variable.
109 //
110 // A tracked variable won't actually get used until the appropriate ancestor tree node
111 // is processed, unless this is marked "isLocalDefUse" because it is a stack-based argument
112 // to a call or an orphaned dead node.
113 //
114 LclVarDsc* const varDsc = &compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum];
115 if (isCandidateVar(varDsc))
116 {
117 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 1));
118 return 0;
119 }
120 srcCount = 0;
121#ifdef FEATURE_SIMD
122 // Need an additional register to read upper 4 bytes of Vector3.
123 if (tree->TypeGet() == TYP_SIMD12)
124 {
125 // We need an internal register different from targetReg in which 'tree' produces its result
126 // because both targetReg and internal reg will be in use at the same time.
127 buildInternalFloatRegisterDefForNode(tree, allSIMDRegs());
128 setInternalRegsDelayFree = true;
129 buildInternalRegisterUses();
130 }
131#endif
132 BuildDef(tree);
133 }
134 break;
135
136 case GT_STORE_LCL_FLD:
137 case GT_STORE_LCL_VAR:
138 srcCount = BuildStoreLoc(tree->AsLclVarCommon());
139 break;
140
141 case GT_FIELD_LIST:
142 // These should always be contained. We don't correctly allocate or
143 // generate code for a non-contained GT_FIELD_LIST.
144 noway_assert(!"Non-contained GT_FIELD_LIST");
145 srcCount = 0;
146 break;
147
148 case GT_LIST:
149 case GT_ARGPLACE:
150 case GT_NO_OP:
151 case GT_START_NONGC:
152 srcCount = 0;
153 assert(dstCount == 0);
154 break;
155
156 case GT_PROF_HOOK:
157 srcCount = 0;
158 assert(dstCount == 0);
159 killMask = getKillSetForProfilerHook();
160 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
161 break;
162
163 case GT_CNS_INT:
164 case GT_CNS_LNG:
165 case GT_CNS_DBL:
166 {
167 srcCount = 0;
168 assert(dstCount == 1);
169 assert(!tree->IsReuseRegVal());
170 RefPosition* def = BuildDef(tree);
171 def->getInterval()->isConstant = true;
172 }
173 break;
174
175#if !defined(_TARGET_64BIT_)
176
177 case GT_LONG:
178 assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
179 // An unused GT_LONG node needs to consume its sources, but need not produce a register.
180 tree->gtType = TYP_VOID;
181 tree->ClearUnusedValue();
182 isLocalDefUse = false;
183 srcCount = 2;
184 dstCount = 0;
185 BuildUse(tree->gtGetOp1());
186 BuildUse(tree->gtGetOp2());
187 break;
188
189#endif // !defined(_TARGET_64BIT_)
190
191 case GT_BOX:
192 case GT_COMMA:
193 case GT_QMARK:
194 case GT_COLON:
195 srcCount = 0;
196 unreached();
197 break;
198
199 case GT_RETURN:
200 srcCount = BuildReturn(tree);
201 killMask = getKillSetForReturn();
202 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
203 break;
204
205 case GT_RETFILT:
206 assert(dstCount == 0);
207 if (tree->TypeGet() == TYP_VOID)
208 {
209 srcCount = 0;
210 }
211 else
212 {
213 assert(tree->TypeGet() == TYP_INT);
214 srcCount = 1;
215 BuildUse(tree->gtGetOp1(), RBM_INTRET);
216 }
217 break;
218
        // A GT_NOP is a passthrough if it is void or has a child, but it must be
        // considered to produce a dummy value if it has a type but no child.
222 case GT_NOP:
223 srcCount = 0;
224 assert((tree->gtGetOp1() == nullptr) || tree->isContained());
225 if (tree->TypeGet() != TYP_VOID && tree->gtGetOp1() == nullptr)
226 {
227 assert(dstCount == 1);
229 BuildDef(tree);
230 }
231 else
232 {
233 assert(dstCount == 0);
234 }
235 break;
236
237 case GT_JTRUE:
238 {
239 srcCount = 0;
240 assert(dstCount == 0);
241 GenTree* cmp = tree->gtGetOp1();
242 assert(!cmp->IsValue());
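            // The compare that feeds the jump is evaluated only for its effect on the condition
            // flags and does not produce a register value, so the GT_JTRUE itself has nothing
            // to consume or define.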
243 }
244 break;
245
246 case GT_JCC:
247 srcCount = 0;
248 assert(dstCount == 0);
249 break;
250
251 case GT_SETCC:
252 srcCount = 0;
253 assert(dstCount == 1);
254 // This defines a byte value (note that on x64 allByteRegs() is defined as RBM_ALLINT).
255 BuildDef(tree, allByteRegs());
256 break;
257
258 case GT_JMP:
259 srcCount = 0;
260 assert(dstCount == 0);
261 break;
262
263 case GT_SWITCH:
264 // This should never occur since switch nodes must not be visible at this
265 // point in the JIT.
266 srcCount = 0;
267 noway_assert(!"Switch must be lowered at this point");
268 break;
269
270 case GT_JMPTABLE:
271 srcCount = 0;
272 assert(dstCount == 1);
273 BuildDef(tree);
274 break;
275
276 case GT_SWITCH_TABLE:
277 {
278 assert(dstCount == 0);
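            // Codegen for the table-based switch jump needs an internal temporary register.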
279 buildInternalIntRegisterDefForNode(tree);
280 srcCount = BuildBinaryUses(tree->AsOp());
281 buildInternalRegisterUses();
282 assert(srcCount == 2);
283 }
284 break;
285
286 case GT_ASG:
287 noway_assert(!"We should never hit any assignment operator in lowering");
288 srcCount = 0;
289 break;
290
291#if !defined(_TARGET_64BIT_)
292 case GT_ADD_LO:
293 case GT_ADD_HI:
294 case GT_SUB_LO:
295 case GT_SUB_HI:
296#endif
297 case GT_ADD:
298 case GT_SUB:
299 case GT_AND:
300 case GT_OR:
301 case GT_XOR:
302 srcCount = BuildBinaryUses(tree->AsOp());
303 assert(dstCount == 1);
304 BuildDef(tree);
305 break;
306
307 case GT_BT:
308 srcCount = BuildBinaryUses(tree->AsOp());
309 assert(dstCount == 0);
310 break;
311
312 case GT_RETURNTRAP:
313 {
314 // This just turns into a compare of its child with an int + a conditional call.
315 RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
316 srcCount = BuildOperandUses(tree->gtGetOp1());
317 buildInternalRegisterUses();
318 killMask = compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC);
319 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
320 }
321 break;
322
323 case GT_MOD:
324 case GT_DIV:
325 case GT_UMOD:
326 case GT_UDIV:
327 srcCount = BuildModDiv(tree->AsOp());
328 break;
329
330#if defined(_TARGET_X86_)
331 case GT_MUL_LONG:
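            // On x86, GT_MUL_LONG produces its 64-bit result in a register pair, so it defines two registers.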
332 dstCount = 2;
333 __fallthrough;
334#endif
335 case GT_MUL:
336 case GT_MULHI:
337 srcCount = BuildMul(tree->AsOp());
338 break;
339
340 case GT_INTRINSIC:
341 srcCount = BuildIntrinsic(tree->AsOp());
342 break;
343
344#ifdef FEATURE_SIMD
345 case GT_SIMD:
346 srcCount = BuildSIMD(tree->AsSIMD());
347 break;
348#endif // FEATURE_SIMD
349
350#ifdef FEATURE_HW_INTRINSICS
351 case GT_HWIntrinsic:
352 srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic());
353 break;
354#endif // FEATURE_HW_INTRINSICS
355
356 case GT_CAST:
357 assert(dstCount == 1);
358 srcCount = BuildCast(tree->AsCast());
359 break;
360
361 case GT_BITCAST:
362 {
363 assert(dstCount == 1);
364 tgtPrefUse = BuildUse(tree->gtGetOp1());
365 BuildDef(tree);
366 srcCount = 1;
367 }
368 break;
369
370 case GT_NEG:
371 // TODO-XArch-CQ:
372 // SSE instruction set doesn't have an instruction to negate a number.
373 // The recommended way is to xor the float/double number with a bitmask.
374 // The only way to xor is using xorps or xorpd both of which operate on
375 // 128-bit operands. To hold the bit-mask we would need another xmm
376 // register or a 16-byte aligned 128-bit data constant. Right now emitter
377 // lacks the support for emitting such constants or instruction with mem
378 // addressing mode referring to a 128-bit operand. For now we use an
379 // internal xmm register to load 32/64-bit bitmask from data section.
380 // Note that by trading additional data section memory (128-bit) we can
381 // save on the need for an internal register and also a memory-to-reg
382 // move.
383 //
384 // Note: another option to avoid internal register requirement is by
385 // lowering as GT_SUB(0, src). This will generate code different from
386 // Jit64 and could possibly result in compat issues (?).
387 if (varTypeIsFloating(tree))
388 {
389
390 RefPosition* internalDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
391 srcCount = BuildOperandUses(tree->gtGetOp1());
392 buildInternalRegisterUses();
393 }
394 else
395 {
396 srcCount = BuildOperandUses(tree->gtGetOp1());
397 }
398 BuildDef(tree);
399 break;
400
401 case GT_NOT:
402 srcCount = BuildOperandUses(tree->gtGetOp1());
403 BuildDef(tree);
404 break;
405
406 case GT_LSH:
407 case GT_RSH:
408 case GT_RSZ:
409 case GT_ROL:
410 case GT_ROR:
411#ifdef _TARGET_X86_
412 case GT_LSH_HI:
413 case GT_RSH_LO:
414#endif
415 srcCount = BuildShiftRotate(tree);
416 break;
417
418 case GT_EQ:
419 case GT_NE:
420 case GT_LT:
421 case GT_LE:
422 case GT_GE:
423 case GT_GT:
424 case GT_TEST_EQ:
425 case GT_TEST_NE:
426 case GT_CMP:
427 srcCount = BuildCmp(tree);
428 break;
429
430 case GT_CKFINITE:
431 {
432 assert(dstCount == 1);
433 RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
434 srcCount = BuildOperandUses(tree->gtGetOp1());
435 buildInternalRegisterUses();
436 BuildDef(tree);
437 }
438 break;
439
440 case GT_CMPXCHG:
441 {
442 srcCount = 3;
443 assert(dstCount == 1);
444
445 // Comparand is preferenced to RAX.
446 // The remaining two operands can be in any reg other than RAX.
447 BuildUse(tree->gtCmpXchg.gtOpLocation, allRegs(TYP_INT) & ~RBM_RAX);
448 BuildUse(tree->gtCmpXchg.gtOpValue, allRegs(TYP_INT) & ~RBM_RAX);
449 BuildUse(tree->gtCmpXchg.gtOpComparand, RBM_RAX);
450 BuildDef(tree, RBM_RAX);
451 }
452 break;
453
454 case GT_XADD:
455 case GT_XCHG:
456 {
457 // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
458 // to special case them.
459 // These tree nodes will have their op1 marked as isDelayFree=true.
460 // That is, op1's reg remains in use until the subsequent instruction.
461 GenTree* addr = tree->gtGetOp1();
462 GenTree* data = tree->gtGetOp2();
463 assert(!addr->isContained());
464 RefPosition* addrUse = BuildUse(addr);
465 setDelayFree(addrUse);
466 tgtPrefUse = addrUse;
467 assert(!data->isContained());
468 BuildUse(data);
469 srcCount = 2;
470 assert(dstCount == 1);
471 BuildDef(tree);
472 }
473 break;
474
475 case GT_PUTARG_REG:
476 srcCount = BuildPutArgReg(tree->AsUnOp());
477 break;
478
479 case GT_CALL:
480 srcCount = BuildCall(tree->AsCall());
481 if (tree->AsCall()->HasMultiRegRetVal())
482 {
483 dstCount = tree->AsCall()->GetReturnTypeDesc()->GetReturnRegCount();
484 }
485 break;
486
487 case GT_ADDR:
488 {
489 // For a GT_ADDR, the child node should not be evaluated into a register
490 GenTree* child = tree->gtGetOp1();
491 assert(!isCandidateLocalRef(child));
492 assert(child->isContained());
493 assert(dstCount == 1);
494 srcCount = 0;
495 }
496 break;
497
498#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
499 case GT_OBJ:
500#endif
501 case GT_BLK:
502 case GT_DYN_BLK:
503 // These should all be eliminated prior to Lowering.
504 assert(!"Non-store block node in Lowering");
505 srcCount = 0;
506 break;
507
508#ifdef FEATURE_PUT_STRUCT_ARG_STK
509 case GT_PUTARG_STK:
510 srcCount = BuildPutArgStk(tree->AsPutArgStk());
511 break;
512#endif // FEATURE_PUT_STRUCT_ARG_STK
513
514 case GT_STORE_BLK:
515 case GT_STORE_OBJ:
516 case GT_STORE_DYN_BLK:
517 srcCount = BuildBlockStore(tree->AsBlk());
518 break;
519
520 case GT_INIT_VAL:
521 // Always a passthrough of its child's value.
522 assert(!"INIT_VAL should always be contained");
523 srcCount = 0;
524 break;
525
526 case GT_LCLHEAP:
527 srcCount = BuildLclHeap(tree);
528 break;
529
530 case GT_ARR_BOUNDS_CHECK:
531#ifdef FEATURE_SIMD
532 case GT_SIMD_CHK:
533#endif // FEATURE_SIMD
534#ifdef FEATURE_HW_INTRINSICS
535 case GT_HW_INTRINSIC_CHK:
536#endif // FEATURE_HW_INTRINSICS
537
538 // Consumes arrLen & index - has no result
539 srcCount = 2;
540 assert(dstCount == 0);
541 srcCount = BuildOperandUses(tree->AsBoundsChk()->gtIndex);
542 srcCount += BuildOperandUses(tree->AsBoundsChk()->gtArrLen);
543 break;
544
545 case GT_ARR_ELEM:
546 // These must have been lowered to GT_ARR_INDEX
547 noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
548 srcCount = 0;
549 break;
550
551 case GT_ARR_INDEX:
552 {
553 srcCount = 2;
554 assert(dstCount == 1);
555 assert(!tree->AsArrIndex()->ArrObj()->isContained());
556 assert(!tree->AsArrIndex()->IndexExpr()->isContained());
557 // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
558 // times while the result is being computed.
559 RefPosition* arrObjUse = BuildUse(tree->AsArrIndex()->ArrObj());
560 setDelayFree(arrObjUse);
561 BuildUse(tree->AsArrIndex()->IndexExpr());
562 BuildDef(tree);
563 }
564 break;
565
566 case GT_ARR_OFFSET:
567 {
568 // This consumes the offset, if any, the arrObj and the effective index,
569 // and produces the flattened offset for this dimension.
570 assert(dstCount == 1);
571 srcCount = 0;
572 RefPosition* internalDef = nullptr;
573 if (tree->gtArrOffs.gtOffset->isContained())
574 {
575 srcCount = 2;
576 }
577 else
578 {
579 // Here we simply need an internal register, which must be different
580 // from any of the operand's registers, but may be the same as targetReg.
581 srcCount = 3;
582 internalDef = buildInternalIntRegisterDefForNode(tree);
583 BuildUse(tree->AsArrOffs()->gtOffset);
584 }
585 BuildUse(tree->AsArrOffs()->gtIndex);
586 BuildUse(tree->AsArrOffs()->gtArrObj);
587 if (internalDef != nullptr)
588 {
589 buildInternalRegisterUses();
590 }
591 BuildDef(tree);
592 }
593 break;
594
595 case GT_LEA:
596 // The LEA usually passes its operands through to the GT_IND, in which case it will
597 // be contained, but we may be instantiating an address, in which case we set them here.
598 srcCount = 0;
599 assert(dstCount == 1);
600 if (tree->AsAddrMode()->HasBase())
601 {
602 srcCount++;
603 BuildUse(tree->AsAddrMode()->Base());
604 }
605 if (tree->AsAddrMode()->HasIndex())
606 {
607 srcCount++;
608 BuildUse(tree->AsAddrMode()->Index());
609 }
610 BuildDef(tree);
611 break;
612
613 case GT_STOREIND:
614 if (compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(tree))
615 {
616 srcCount = BuildGCWriteBarrier(tree);
617 break;
618 }
619 srcCount = BuildIndir(tree->AsIndir());
620 break;
621
622 case GT_NULLCHECK:
623 {
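            // A null check simply consumes its address operand and produces no value.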
624 assert(dstCount == 0);
625 regMaskTP indirCandidates = RBM_NONE;
626 BuildUse(tree->gtGetOp1(), indirCandidates);
627 srcCount = 1;
628 break;
629 }
630
631 case GT_IND:
632 srcCount = BuildIndir(tree->AsIndir());
633 assert(dstCount == 1);
634 break;
635
636 case GT_CATCH_ARG:
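            // The catch argument (the exception object) is made available in a fixed register.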
637 srcCount = 0;
638 assert(dstCount == 1);
639 BuildDef(tree, RBM_EXCEPTION_OBJECT);
640 break;
641
642#if !FEATURE_EH_FUNCLETS
643 case GT_END_LFIN:
644 srcCount = 0;
645 assert(dstCount == 0);
646 break;
647#endif
648
649 case GT_CLS_VAR:
650 // These nodes are eliminated by rationalizer.
651 JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
652 unreached();
653 break;
654
655 case GT_INDEX_ADDR:
656 {
657 assert(dstCount == 1);
658 RefPosition* internalDef = nullptr;
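            // An internal register is needed when the index is pointer-sized, or when the element
            // size cannot be encoded as an addressing-mode scale (i.e. it is not 1, 2, 4 or 8).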
659 if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
660 {
661 internalDef = buildInternalIntRegisterDefForNode(tree);
662 }
663 else
664 {
665 switch (tree->AsIndexAddr()->gtElemSize)
666 {
667 case 1:
668 case 2:
669 case 4:
670 case 8:
671 break;
672
673 default:
674 internalDef = buildInternalIntRegisterDefForNode(tree);
675 break;
676 }
677 }
678 srcCount = BuildBinaryUses(tree->AsOp());
679 if (internalDef != nullptr)
680 {
681 buildInternalRegisterUses();
682 }
683 BuildDef(tree);
684 }
685 break;
686
687 } // end switch (tree->OperGet())
688
689 // We need to be sure that we've set srcCount and dstCount appropriately.
    // Note that for XARCH, the maximum number of registers defined is 2.
691 assert((dstCount < 2) || ((dstCount == 2) && tree->IsMultiRegNode()));
692 assert(isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
693 assert(!tree->IsUnusedValue() || (dstCount != 0));
694 assert(dstCount == tree->GetRegisterDstCount());
695 INDEBUG(dumpNodeInfo(tree, dstCandidates, srcCount, dstCount));
696 return srcCount;
697}
698
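//------------------------------------------------------------------------
// getTgtPrefOperand: Identify the operand, if any, that should be preferenced to the
//    target register of a read-modify-write node.
//
// Arguments:
//    tree - The binary-op node of interest
//
// Return Value:
//    The operand to preference to the target register, or nullptr if there is none
//    (e.g. the node is not an RMW operation, or the candidate operand is contained).
//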
699GenTree* LinearScan::getTgtPrefOperand(GenTreeOp* tree)
700{
701 // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
702 // Even then we would like to set isTgtPref on Op1.
703 if (tree->OperIsBinary() && isRMWRegOper(tree))
704 {
705 GenTree* op1 = tree->gtGetOp1();
706 GenTree* op2 = tree->gtGetOp2();
707
708 // Commutative opers like add/mul/and/or/xor could reverse the order of
709 // operands if it is safe to do so. In such a case we would like op2 to be
710 // target preferenced instead of op1.
711 if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
712 {
713 op1 = op2;
714 op2 = tree->gtGetOp1();
715 }
716
717 // If we have a read-modify-write operation, we want to preference op1 to the target,
718 // if it is not contained.
719 if (!op1->isContained() && !op1->OperIs(GT_LIST))
720 {
721 return op1;
722 }
723 }
724 return nullptr;
725}
726
727//------------------------------------------------------------------------------
// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format?
729//
730// Arguments:
731// tree - a binary tree node
732//
733// Return Value:
734// Returns true if we can use the read-modify-write instruction form
735//
736// Notes:
737// This is used to determine whether to preference the source to the destination register.
738//
739bool LinearScan::isRMWRegOper(GenTree* tree)
740{
    // TODO-XArch-CQ: Make this more accurate.
    // For now, we assume that most binary operators are of the RMW form.
743 assert(tree->OperIsBinary());
744
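    // Comparisons and bit-test operations only set the condition flags; they do not
    // overwrite an operand with their result, so they are not RMW.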
745 if (tree->OperIsCompare() || tree->OperIs(GT_CMP) || tree->OperIs(GT_BT))
746 {
747 return false;
748 }
749
750 switch (tree->OperGet())
751 {
        // These opers either support a three-operand form (e.g. GT_LEA), or do not read/write their first operand.
753 case GT_LEA:
754 case GT_STOREIND:
755 case GT_ARR_INDEX:
756 case GT_STORE_BLK:
757 case GT_STORE_OBJ:
758 case GT_SWITCH_TABLE:
759 case GT_LOCKADD:
760#ifdef _TARGET_X86_
761 case GT_LONG:
762#endif
763 return false;
764
        // x86/x64 supports a three-operand multiply when either op1 or op2 is a contained immediate.
766 case GT_MUL:
767 return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
768
769#ifdef FEATURE_HW_INTRINSICS
770 case GT_HWIntrinsic:
771 return tree->isRMWHWIntrinsic(compiler);
772#endif // FEATURE_HW_INTRINSICS
773
774 default:
775 return true;
776 }
777}
778
//------------------------------------------------------------------------
// BuildRMWUses: Support for building RefPositions for RMW nodes.
//
// Arguments:
//    node       - The read-modify-write node of interest
//    candidates - The set of candidate registers for the node's operands
//
// Return Value:
//    The number of sources consumed by this node.
//
780int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates)
781{
782 int srcCount = 0;
783 GenTree* op1 = node->gtOp1;
784 GenTree* op2 = node->gtGetOp2IfPresent();
785 bool isReverseOp = node->IsReverseOp();
786 regMaskTP op1Candidates = candidates;
787 regMaskTP op2Candidates = candidates;
788
789#ifdef _TARGET_X86_
790 if (varTypeIsByte(node))
791 {
792 regMaskTP byteCandidates = (candidates == RBM_NONE) ? allByteRegs() : (candidates & allByteRegs());
793 if (!op1->isContained())
794 {
795 assert(byteCandidates != RBM_NONE);
796 op1Candidates = byteCandidates;
797 }
798 if (node->OperIsCommutative() && !op2->isContained())
799 {
800 assert(byteCandidates != RBM_NONE);
801 op2Candidates = byteCandidates;
802 }
803 }
804#endif // _TARGET_X86_
805
806 GenTree* tgtPrefOperand = getTgtPrefOperand(node);
807 assert((tgtPrefOperand == nullptr) || (tgtPrefOperand == op1) || node->OperIsCommutative());
808 assert(!isReverseOp || node->OperIsCommutative());
809
810 // Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
811 // but if we have a commutative operator and op1 is a contained memory op, it would be op1.
    // We need to make the delayRegFree operand remain live until the op is complete, by marking
    // the source(s) associated with that operand as "delayFree".
814 // Note that if op2 of a binary RMW operator is a memory op, even if the operator
815 // is commutative, codegen cannot reverse them.
816 // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
817 // more work to be done to correctly reverse the operands if they involve memory
818 // operands. Also, we may need to handle more cases than GT_IND, especially once
819 // we've modified the register allocator to not require all nodes to be assigned
820 // a register (e.g. a spilled lclVar can often be referenced directly from memory).
821 // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
822 GenTree* delayUseOperand = op2;
823 if (node->OperIsCommutative())
824 {
825 if (op1->isContained() && op2 != nullptr)
826 {
827 delayUseOperand = op1;
828 }
829 else if (!op2->isContained() || op2->IsCnsIntOrI())
830 {
831 // If we have a commutative operator and op2 is not a memory op, we don't need
832 // to set delayRegFree on either operand because codegen can swap them.
833 delayUseOperand = nullptr;
834 }
835 }
836 else if (op1->isContained())
837 {
838 delayUseOperand = nullptr;
839 }
840 if (delayUseOperand != nullptr)
841 {
842 assert(delayUseOperand != tgtPrefOperand);
843 }
844
845 if (isReverseOp)
846 {
847 op1 = op2;
848 op2 = node->gtOp1;
849 }
850
851 // Build first use
852 if (tgtPrefOperand == op1)
853 {
854 assert(!op1->isContained());
855 tgtPrefUse = BuildUse(op1, op1Candidates);
856 srcCount++;
857 }
858 else if (delayUseOperand == op1)
859 {
860 srcCount += BuildDelayFreeUses(op1, op1Candidates);
861 }
862 else
863 {
864 srcCount += BuildOperandUses(op1, op1Candidates);
865 }
866 // Build second use
867 if (op2 != nullptr)
868 {
869 if (tgtPrefOperand == op2)
870 {
871 assert(!op2->isContained());
872 tgtPrefUse = BuildUse(op2, op2Candidates);
873 srcCount++;
874 }
875 else if (delayUseOperand == op2)
876 {
877 srcCount += BuildDelayFreeUses(op2, op2Candidates);
878 }
879 else
880 {
881 srcCount += BuildOperandUses(op2, op2Candidates);
882 }
883 }
884 return srcCount;
885}
886
887//------------------------------------------------------------------------
888// BuildShiftRotate: Set the NodeInfo for a shift or rotate.
889//
890// Arguments:
891// tree - The node of interest
892//
893// Return Value:
894// The number of sources consumed by this node.
895//
896int LinearScan::BuildShiftRotate(GenTree* tree)
897{
    // For shift operations, the number of bits to shift must be placed in CL when it is
    // not a constant, because the variable-count forms of the shift instructions take
    // their count in CL.
901 int srcCount = 0;
902 GenTree* shiftBy = tree->gtGetOp2();
903 GenTree* source = tree->gtGetOp1();
904 regMaskTP srcCandidates = RBM_NONE;
905 regMaskTP dstCandidates = RBM_NONE;
906
    // x64 can encode 8 bits of shift count, but the hardware only uses the low 5 (or 6 for
    // 64-bit operands); the rest are masked off.
    // We will allow whatever can be encoded - hope you know what you are doing.
909 if (shiftBy->isContained())
910 {
911 assert(shiftBy->OperIsConst());
912 }
913 else
914 {
915 srcCandidates = allRegs(TYP_INT) & ~RBM_RCX;
916 dstCandidates = allRegs(TYP_INT) & ~RBM_RCX;
917 }
918
919 // Note that Rotate Left/Right instructions don't set ZF and SF flags.
920 //
    // If the operand being shifted is 32 bits, then the upper three bits of the shift count
    // are masked off by hardware to get the actual shift count. Similarly, for 64-bit operands
    // the shift count is narrowed to [0..63]. If the resulting shift count is zero, then the
    // shift operation won't modify flags.
925 //
926 // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
927 // if the shift count is known to be non-zero and in the range depending on the
928 // operand size.
929 CLANG_FORMAT_COMMENT_ANCHOR;
930
931#ifdef _TARGET_X86_
932 // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
933 // we can have a three operand form.
934 if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
935 {
936 assert((source->OperGet() == GT_LONG) && source->isContained());
937
938 GenTree* sourceLo = source->gtGetOp1();
939 GenTree* sourceHi = source->gtGetOp2();
940 assert(!sourceLo->isContained() && !sourceHi->isContained());
941 RefPosition* sourceLoUse = BuildUse(sourceLo, srcCandidates);
942 RefPosition* sourceHiUse = BuildUse(sourceHi, srcCandidates);
943
944 if (!tree->isContained())
945 {
946 if (tree->OperGet() == GT_LSH_HI)
947 {
948 setDelayFree(sourceLoUse);
949 }
950 else
951 {
952 setDelayFree(sourceHiUse);
953 }
954 }
955 }
956 else
957#endif
958 if (!source->isContained())
959 {
960 tgtPrefUse = BuildUse(source, srcCandidates);
961 srcCount++;
962 }
963 else
964 {
965 srcCount += BuildOperandUses(source, srcCandidates);
966 }
967 if (!tree->isContained())
968 {
969 if (!shiftBy->isContained())
970 {
971 srcCount += BuildDelayFreeUses(shiftBy, RBM_RCX);
972 buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
973 }
974 BuildDef(tree, dstCandidates);
975 }
976 else
977 {
978 if (!shiftBy->isContained())
979 {
980 srcCount += BuildOperandUses(shiftBy, RBM_RCX);
981 buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
982 }
983 }
984 return srcCount;
985}
986
987//------------------------------------------------------------------------
988// BuildCall: Set the NodeInfo for a call.
989//
990// Arguments:
991// call - The call node of interest
992//
993// Return Value:
994// The number of sources consumed by this node.
995//
996int LinearScan::BuildCall(GenTreeCall* call)
997{
998 bool hasMultiRegRetVal = false;
999 ReturnTypeDesc* retTypeDesc = nullptr;
1000 int srcCount = 0;
1001 int dstCount = 0;
1002 regMaskTP dstCandidates = RBM_NONE;
1003
1004 assert(!call->isContained());
1005 if (call->TypeGet() != TYP_VOID)
1006 {
1007 hasMultiRegRetVal = call->HasMultiRegRetVal();
1008 if (hasMultiRegRetVal)
1009 {
1010 // dst count = number of registers in which the value is returned by call
1011 retTypeDesc = call->GetReturnTypeDesc();
1012 dstCount = retTypeDesc->GetReturnRegCount();
1013 }
1014 else
1015 {
1016 dstCount = 1;
1017 }
1018 }
1019
1020 GenTree* ctrlExpr = call->gtControlExpr;
1021 if (call->gtCallType == CT_INDIRECT)
1022 {
1023 ctrlExpr = call->gtCallAddr;
1024 }
1025
1026 RegisterType registerType = call->TypeGet();
1027
1028 // Set destination candidates for return value of the call.
1029 CLANG_FORMAT_COMMENT_ANCHOR;
1030
1031#ifdef _TARGET_X86_
1032 if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
1033 {
1034 // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
1035 // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
1036 // correct argument registers.
1037 dstCandidates = RBM_PINVOKE_TCB;
1038 }
1039 else
1040#endif // _TARGET_X86_
1041 if (hasMultiRegRetVal)
1042 {
1043 assert(retTypeDesc != nullptr);
1044 dstCandidates = retTypeDesc->GetABIReturnRegs();
1045 assert((int)genCountBits(dstCandidates) == dstCount);
1046 }
1047 else if (varTypeIsFloating(registerType))
1048 {
1049#ifdef _TARGET_X86_
1050 // The return value will be on the X87 stack, and we will need to move it.
1051 dstCandidates = allRegs(registerType);
1052#else // !_TARGET_X86_
1053 dstCandidates = RBM_FLOATRET;
1054#endif // !_TARGET_X86_
1055 }
1056 else if (registerType == TYP_LONG)
1057 {
1058 dstCandidates = RBM_LNGRET;
1059 }
1060 else
1061 {
1062 dstCandidates = RBM_INTRET;
1063 }
1064
1065 // number of args to a call =
1066 // callRegArgs + (callargs - placeholders, setup, etc)
1067 // there is an explicit thisPtr but it is redundant
1068
1069 bool callHasFloatRegArgs = false;
1070 bool isVarArgs = call->IsVarargs();
1071
1072 // First, determine internal registers.
1073 // We will need one for any float arguments to a varArgs call.
1074 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1075 {
1076 GenTree* argNode = list->Current();
1077 if (argNode->OperIsPutArgReg())
1078 {
1079 HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1080 }
1081 else if (argNode->OperGet() == GT_FIELD_LIST)
1082 {
1083 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1084 {
1085 assert(entry->Current()->OperIsPutArgReg());
1086 HandleFloatVarArgs(call, entry->Current(), &callHasFloatRegArgs);
1087 }
1088 }
1089 }
1090
1091 // Now, count reg args
1092 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1093 {
1094 // By this point, lowering has ensured that all call arguments are one of the following:
1095 // - an arg setup store
1096 // - an arg placeholder
1097 // - a nop
1098 // - a copy blk
1099 // - a field list
1100 // - a put arg
1101 //
1102 // Note that this property is statically checked by LinearScan::CheckBlock.
1103 GenTree* argNode = list->Current();
1104
1105 // Each register argument corresponds to one source.
1106 if (argNode->OperIsPutArgReg())
1107 {
1108 srcCount++;
1109 BuildUse(argNode, genRegMask(argNode->gtRegNum));
1110 }
1111#ifdef UNIX_AMD64_ABI
1112 else if (argNode->OperGet() == GT_FIELD_LIST)
1113 {
1114 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1115 {
1116 assert(entry->Current()->OperIsPutArgReg());
1117 srcCount++;
1118 BuildUse(entry->Current(), genRegMask(entry->Current()->gtRegNum));
1119 }
1120 }
1121#endif // UNIX_AMD64_ABI
1122
1123#ifdef DEBUG
1124 // In DEBUG only, check validity with respect to the arg table entry.
1125
1126 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1127 assert(curArgTabEntry);
1128
1129 if (curArgTabEntry->regNum == REG_STK)
1130 {
1131 // late arg that is not passed in a register
1132 assert(argNode->gtOper == GT_PUTARG_STK);
1133
1134#ifdef FEATURE_PUT_STRUCT_ARG_STK
1135 // If the node is TYP_STRUCT and it is put on stack with
1136 // putarg_stk operation, we consume and produce no registers.
1137 // In this case the embedded Obj node should not produce
1138 // registers too since it is contained.
1139 // Note that if it is a SIMD type the argument will be in a register.
1140 if (argNode->TypeGet() == TYP_STRUCT)
1141 {
1142 assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ);
1143 assert(argNode->gtGetOp1()->isContained());
1144 }
1145#endif // FEATURE_PUT_STRUCT_ARG_STK
1146 continue;
1147 }
1148#ifdef UNIX_AMD64_ABI
1149 if (argNode->OperGet() == GT_FIELD_LIST)
1150 {
1151 assert(argNode->isContained());
1152 assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1153
1154 int i = 0;
1155 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1156 {
1157 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1158 assert(entry->Current()->gtRegNum == argReg);
1159 assert(i < 2);
1160 i++;
1161 }
1162 }
1163 else
1164#endif // UNIX_AMD64_ABI
1165 {
1166 const regNumber argReg = curArgTabEntry->regNum;
1167 assert(argNode->gtRegNum == argReg);
1168 }
1169#endif // DEBUG
1170 }
1171
1172 // Now, count stack args
1173 // Note that these need to be computed into a register, but then
1174 // they're just stored to the stack - so the reg doesn't
1175 // need to remain live until the call. In fact, it must not
1176 // because the code generator doesn't actually consider it live,
1177 // so it can't be spilled.
1178
1179 GenTree* args = call->gtCallArgs;
1180 while (args)
1181 {
1182 GenTree* arg = args->gtGetOp1();
1183 if (!(arg->gtFlags & GTF_LATE_ARG) && !arg)
1184 {
1185 if (arg->IsValue() && !arg->isContained())
1186 {
1187 assert(arg->IsUnusedValue());
1188 }
1189 }
1190 args = args->gtGetOp2();
1191 }
1192
1193 // set reg requirements on call target represented as control sequence.
1194 if (ctrlExpr != nullptr)
1195 {
1196 regMaskTP ctrlExprCandidates = RBM_NONE;
1197
1198 // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1199 // computed into a register.
1200 if (call->IsFastTailCall())
1201 {
1202 assert(!ctrlExpr->isContained());
1203 // Fast tail call - make sure that call target is always computed in RAX
1204 // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1205 ctrlExprCandidates = RBM_RAX;
1206 }
1207#ifdef _TARGET_X86_
1208 else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1209 {
1210 // On x86, we need to generate a very specific pattern for indirect VSD calls:
1211 //
1212 // 3-byte nop
1213 // call dword ptr [eax]
1214 //
1215 // Where EAX is also used as an argument to the stub dispatch helper. Make
1216 // sure that the call target address is computed into EAX in this case.
1217 assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1218 ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET;
1219 }
1220#endif // _TARGET_X86_
1221
1222#if FEATURE_VARARG
        // If it is a fast tail call, the call target is already preferenced to use RAX.
        // Therefore, there is no need to set src candidates on the call target again.
1225 if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall())
1226 {
1227 // Don't assign the call target to any of the argument registers because
1228 // we will use them to also pass floating point arguments as required
1229 // by Amd64 ABI.
1230 ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS);
1231 }
#endif // FEATURE_VARARG
1233 srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
1234 }
1235
1236 buildInternalRegisterUses();
1237
1238 // Now generate defs and kills.
1239 regMaskTP killMask = getKillSetForCall(call);
1240 BuildDefsWithKills(call, dstCount, dstCandidates, killMask);
1241 return srcCount;
1242}
1243
1244//------------------------------------------------------------------------
1245// BuildBlockStore: Set the NodeInfo for a block store.
1246//
1247// Arguments:
1248// blkNode - The block store node of interest
1249//
1250// Return Value:
1251// The number of sources consumed by this node.
1252//
1253int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1254{
1255 GenTree* dstAddr = blkNode->Addr();
1256 unsigned size = blkNode->gtBlkSize;
1257 GenTree* source = blkNode->Data();
1258 int srcCount = 0;
1259
1260 GenTree* srcAddrOrFill = nullptr;
1261 bool isInitBlk = blkNode->OperIsInitBlkOp();
1262
1263 regMaskTP dstAddrRegMask = RBM_NONE;
1264 regMaskTP sourceRegMask = RBM_NONE;
1265 regMaskTP blkSizeRegMask = RBM_NONE;
1266
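    // Each kind of block operation (unrolled, rep instruction, or helper call) imposes its own
    // fixed register requirements; they are computed below before building the uses.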
1267 if (isInitBlk)
1268 {
1269 GenTree* initVal = source;
1270 if (initVal->OperIsInitVal())
1271 {
1272 assert(initVal->isContained());
1273 initVal = initVal->gtGetOp1();
1274 }
1275 srcAddrOrFill = initVal;
1276
1277 switch (blkNode->gtBlkOpKind)
1278 {
1279 case GenTreeBlk::BlkOpKindUnroll:
1280 assert(initVal->IsCnsIntOrI());
1281 if (size >= XMM_REGSIZE_BYTES)
1282 {
1283 // Reserve an XMM register to fill it with a pack of 16 init value constants.
1284 buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1285 // use XMM register to fill with constants, it's AVX instruction and set the flag
1286 SetContainsAVXFlags();
1287 }
1288#ifdef _TARGET_X86_
1289 if ((size & 1) != 0)
1290 {
1291 // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1292 // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1293 // when unrolling, so only allow byteable registers as the source value. (We could
1294 // consider just using BlkOpKindRepInstr instead.)
1295 sourceRegMask = allByteRegs();
1296 }
1297#endif // _TARGET_X86_
1298 break;
1299
1300 case GenTreeBlk::BlkOpKindRepInstr:
                // rep stos has the following register requirements:
                // a) The destination address has to be in RDI.
                // b) The fill value has to be in RAX.
                // c) The buffer size will go in RCX.
1305 dstAddrRegMask = RBM_RDI;
1306 sourceRegMask = RBM_RAX;
1307 blkSizeRegMask = RBM_RCX;
1308 break;
1309
1310 case GenTreeBlk::BlkOpKindHelper:
1311#ifdef _TARGET_AMD64_
1312 // The helper follows the regular AMD64 ABI.
1313 dstAddrRegMask = RBM_ARG_0;
1314 sourceRegMask = RBM_ARG_1;
1315 blkSizeRegMask = RBM_ARG_2;
1316#else // !_TARGET_AMD64_
1317 dstAddrRegMask = RBM_RDI;
1318 sourceRegMask = RBM_RAX;
1319 blkSizeRegMask = RBM_RCX;
1320#endif // !_TARGET_AMD64_
1321 break;
1322
1323 default:
1324 unreached();
1325 }
1326 }
1327 else
1328 {
1329 // CopyObj or CopyBlk
1330 if (source->gtOper == GT_IND)
1331 {
1332 assert(source->isContained());
1333 srcAddrOrFill = source->gtGetOp1();
1334 }
1335 if (blkNode->OperGet() == GT_STORE_OBJ)
1336 {
1337 if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1338 {
1339 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1340 blkSizeRegMask = RBM_RCX;
1341 }
1342 // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its
1343 // sources.
1344 sourceRegMask = RBM_RSI;
1345 dstAddrRegMask = RBM_RDI;
1346 }
1347 else
1348 {
1349 switch (blkNode->gtBlkOpKind)
1350 {
1351 case GenTreeBlk::BlkOpKindUnroll:
1352 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1353 //
1354 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1355 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
1356 // RBM_NON_BYTE_REGS from internal candidates.
1357 if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1358 {
1359 regMaskTP regMask = allRegs(TYP_INT);
1360
1361#ifdef _TARGET_X86_
1362 if ((size & 1) != 0)
1363 {
1364 regMask &= ~RBM_NON_BYTE_REGS;
1365 }
1366#endif
1367 buildInternalIntRegisterDefForNode(blkNode, regMask);
1368 }
1369
1370 if (size >= XMM_REGSIZE_BYTES)
1371 {
1372 // If we have a buffer larger than XMM_REGSIZE_BYTES,
1373 // reserve an XMM register to use it for a
1374 // series of 16-byte loads and stores.
1375 buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1376 // Uses XMM reg for load and store and hence check to see whether AVX instructions
1377 // are used for codegen, set ContainsAVX flag
1378 SetContainsAVXFlags();
1379 }
1380 break;
1381
1382 case GenTreeBlk::BlkOpKindRepInstr:
                    // rep movs has the following register requirements:
                    // a) The dest address has to be in RDI.
                    // b) The src address has to be in RSI.
                    // c) The buffer size will go in RCX.
1387 dstAddrRegMask = RBM_RDI;
1388 sourceRegMask = RBM_RSI;
1389 blkSizeRegMask = RBM_RCX;
1390 break;
1391
1392 case GenTreeBlk::BlkOpKindHelper:
1393#ifdef _TARGET_AMD64_
1394 // The helper follows the regular AMD64 ABI.
1395 dstAddrRegMask = RBM_ARG_0;
1396 sourceRegMask = RBM_ARG_1;
1397 blkSizeRegMask = RBM_ARG_2;
1398#else // !_TARGET_AMD64_
1399 dstAddrRegMask = RBM_RDI;
1400 sourceRegMask = RBM_RAX;
1401 blkSizeRegMask = RBM_RCX;
1402#endif // !_TARGET_AMD64_
1403 break;
1404
1405 default:
1406 unreached();
1407 }
1408 }
1409 if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE))
1410 {
1411 // This is a local source; we'll use a temp register for its address.
1412 assert(source->isContained() && source->OperIsLocal());
1413 buildInternalIntRegisterDefForNode(blkNode, sourceRegMask);
1414 }
1415 }
1416
1417 if ((size != 0) && (blkSizeRegMask != RBM_NONE))
1418 {
1419 // Reserve a temp register for the block size argument.
1420 buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
1421 }
1422
1423 if (!dstAddr->isContained() && !blkNode->IsReverseOp())
1424 {
1425 srcCount++;
1426 BuildUse(dstAddr, dstAddrRegMask);
1427 }
1428 if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
1429 {
1430 srcCount++;
1431 BuildUse(srcAddrOrFill, sourceRegMask);
1432 }
1433 if (!dstAddr->isContained() && blkNode->IsReverseOp())
1434 {
1435 srcCount++;
1436 BuildUse(dstAddr, dstAddrRegMask);
1437 }
1438
1439 if (size == 0)
1440 {
1441 assert(blkNode->OperIs(GT_STORE_DYN_BLK));
1442 // The block size argument is a third argument to GT_STORE_DYN_BLK
1443 srcCount++;
1444 GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1445 BuildUse(blockSize, blkSizeRegMask);
1446 }
1447 buildInternalRegisterUses();
1448 regMaskTP killMask = getKillSetForBlockStore(blkNode);
1449 BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
1450 return srcCount;
1451}
1452
1453#ifdef FEATURE_PUT_STRUCT_ARG_STK
1454//------------------------------------------------------------------------
1455// BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1456//
1457// Arguments:
1458// tree - The node of interest
1459//
1460// Return Value:
1461// The number of sources consumed by this node.
1462//
1463int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1464{
1465 int srcCount = 0;
1466 if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1467 {
1468 assert(putArgStk->gtOp1->isContained());
1469
1470 RefPosition* simdTemp = nullptr;
1471 RefPosition* intTemp = nullptr;
1472 unsigned prevOffset = putArgStk->getArgSize();
1473 // We need to iterate over the fields twice; once to determine the need for internal temps,
1474 // and once to actually build the uses.
1475 for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1476 {
1477 GenTree* const fieldNode = current->Current();
1478 const var_types fieldType = fieldNode->TypeGet();
1479 const unsigned fieldOffset = current->gtFieldOffset;
1480
1481#ifdef _TARGET_X86_
1482 assert(fieldType != TYP_LONG);
1483#endif // _TARGET_X86_
1484
1485#if defined(FEATURE_SIMD)
1486 // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1487 // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1488 // we "round up" to 16.
1489 if ((current->gtFieldType == TYP_SIMD12) && (simdTemp == nullptr))
1490 {
1491 simdTemp = buildInternalFloatRegisterDefForNode(putArgStk);
1492 }
1493#endif // defined(FEATURE_SIMD)
1494
1495#ifdef _TARGET_X86_
1496 if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1497 {
1498 // We can treat as a slot any field that is stored at a slot boundary, where the previous
1499 // field is not in the same slot. (Note that we store the fields in reverse order.)
1500 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1501 if (intTemp == nullptr)
1502 {
1503 intTemp = buildInternalIntRegisterDefForNode(putArgStk);
1504 }
1505 if (!fieldIsSlot && varTypeIsByte(fieldType))
1506 {
1507 // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
1508 // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
1509 // need a byte-addressable register for the store. We will enforce this requirement on an internal
1510 // register, which we can use to copy multiple byte values.
1511 intTemp->registerAssignment &= allByteRegs();
1512 }
1513 }
1514#endif // _TARGET_X86_
1515
1516 if (varTypeIsGC(fieldType))
1517 {
1518 putArgStk->gtNumberReferenceSlots++;
1519 }
1520 prevOffset = fieldOffset;
1521 }
1522
1523 for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1524 {
1525 GenTree* const fieldNode = current->Current();
1526 if (!fieldNode->isContained())
1527 {
1528 BuildUse(fieldNode);
1529 srcCount++;
1530 }
1531 }
1532 buildInternalRegisterUses();
1533
1534 return srcCount;
1535 }
1536
1537 GenTree* src = putArgStk->gtOp1;
1538 var_types type = src->TypeGet();
1539
1540#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1541 // For PutArgStk of a TYP_SIMD12, we need an extra register.
1542 if (putArgStk->isSIMD12())
1543 {
1544 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1545 BuildUse(putArgStk->gtOp1);
1546 srcCount = 1;
1547 buildInternalRegisterUses();
1548 return srcCount;
1549 }
1550#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1551
1552 if (type != TYP_STRUCT)
1553 {
1554 return BuildSimple(putArgStk);
1555 }
1556
1557 GenTree* dst = putArgStk;
1558 GenTree* srcAddr = nullptr;
1559
    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
    // our framework assemblies, so this is the main code generation scheme we'll use.
1563 ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1564 switch (putArgStk->gtPutArgStkKind)
1565 {
1566 case GenTreePutArgStk::Kind::Push:
1567 case GenTreePutArgStk::Kind::PushAllSlots:
1568 case GenTreePutArgStk::Kind::Unroll:
1569 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1570 //
1571 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1572 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
1573 // RBM_NON_BYTE_REGS from internal candidates.
1574 if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1575 {
1576 regMaskTP regMask = allRegs(TYP_INT);
1577
1578#ifdef _TARGET_X86_
1579 if ((size % 2) != 0)
1580 {
1581 regMask &= ~RBM_NON_BYTE_REGS;
1582 }
1583#endif
1584 buildInternalIntRegisterDefForNode(putArgStk, regMask);
1585 }
1586
1587#ifdef _TARGET_X86_
1588 if (size >= 8)
1589#else // !_TARGET_X86_
1590 if (size >= XMM_REGSIZE_BYTES)
1591#endif // !_TARGET_X86_
1592 {
1593 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1594 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
1595 // series of 16-byte loads and stores.
1596 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1597 SetContainsAVXFlags();
1598 }
1599 break;
1600
1601 case GenTreePutArgStk::Kind::RepInstr:
1602 buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
1603 buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
1604 buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
1605 break;
1606
1607 default:
1608 unreached();
1609 }
1610
1611 srcCount = BuildOperandUses(src);
1612 buildInternalRegisterUses();
1613 return srcCount;
1614}
1615#endif // FEATURE_PUT_STRUCT_ARG_STK
1616
1617//------------------------------------------------------------------------
1618// BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1619//
1620// Arguments:
1621// tree - The node of interest
1622//
1623// Return Value:
1624// The number of sources consumed by this node.
1625//
1626int LinearScan::BuildLclHeap(GenTree* tree)
1627{
1628 int srcCount = 1;
1629
1630 // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1631 // Here '-' means don't care.
1632 //
    // Size?                    Init Memory?    # temp regs
    //  0                            -               0 (returns 0)
    //  const and <=6 reg words      -               0 (pushes '0')
    //  const and >6 reg words       Yes             0 (pushes '0')
    //  const and <PageSize          No              0 (amd64) 1 (x86)
    //                                                 (x86: tmpReg for subtracting from esp)
    //  const and >=PageSize         No              2 (regCnt and tmpReg for subtracting from sp)
    //  Non-const                    Yes             0 (regCnt=targetReg and pushes '0')
    //  Non-const                    No              2 (regCnt and tmpReg for subtracting from sp)
1642 //
1643 // Note: Here we don't need internal register to be different from targetReg.
1644 // Rather, require it to be different from operand's reg.
1645
1646 GenTree* size = tree->gtGetOp1();
1647 if (size->IsCnsIntOrI())
1648 {
1649 assert(size->isContained());
1650 srcCount = 0;
1651 size_t sizeVal = size->gtIntCon.gtIconVal;
1652
1653 if (sizeVal == 0)
1654 {
1655 buildInternalIntRegisterDefForNode(tree);
1656 }
1657 else
1658 {
1659 // Compute the amount of memory to properly STACK_ALIGN.
            // Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
            // This should also help in debugging as we can examine the original size specified with localloc.
1662 sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1663
1664 // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1665 // we will generate 'push 0'.
1666 assert((sizeVal % REGSIZE_BYTES) == 0);
1667 size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1668 if (cntRegSizedWords > 6)
1669 {
1670 if (!compiler->info.compInitMem)
1671 {
1672 // No need to initialize allocated stack space.
1673 if (sizeVal < compiler->eeGetPageSize())
1674 {
1675#ifdef _TARGET_X86_
1676 // x86 needs a register here to avoid generating "sub" on ESP.
1677 buildInternalIntRegisterDefForNode(tree);
1678#endif
1679 }
1680 else
1681 {
1682 // We need two registers: regCnt and RegTmp
1683 buildInternalIntRegisterDefForNode(tree);
1684 buildInternalIntRegisterDefForNode(tree);
1685 }
1686 }
1687 }
1688 }
1689 }
1690 else
1691 {
1692 if (!compiler->info.compInitMem)
1693 {
1694 buildInternalIntRegisterDefForNode(tree);
1695 buildInternalIntRegisterDefForNode(tree);
1696 }
1697 BuildUse(size);
1698 }
1699 buildInternalRegisterUses();
1700 BuildDef(tree);
1701 return srcCount;
1702}
1703
1704//------------------------------------------------------------------------
1705// BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1706//
1707// Arguments:
1708// tree - The node of interest
1709//
1710// Return Value:
1711// The number of sources consumed by this node.
1712//
1713int LinearScan::BuildModDiv(GenTree* tree)
1714{
1715 GenTree* op1 = tree->gtGetOp1();
1716 GenTree* op2 = tree->gtGetOp2();
1717 regMaskTP dstCandidates = RBM_NONE;
1718 RefPosition* internalDef = nullptr;
1719 int srcCount = 0;
1720
1721 if (varTypeIsFloating(tree->TypeGet()))
1722 {
1723 return BuildSimple(tree);
1724 }
1725
    // The Amd64 div/idiv instructions take the dividend in RDX:RAX and compute
    // the quotient in RAX and the remainder in RDX.
1729
1730 if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1731 {
1732 // We are interested in just the remainder.
1733 // RAX is used as a trashable register during computation of remainder.
1734 dstCandidates = RBM_RDX;
1735 }
1736 else
1737 {
1738 // We are interested in just the quotient.
1739 // RDX gets used as trashable register during computation of quotient
1740 dstCandidates = RBM_RAX;
1741 }
1742
1743#ifdef _TARGET_X86_
1744 if (op1->OperGet() == GT_LONG)
1745 {
1746 assert(op1->isContained());
1747
        // To avoid a register move, we would like to have op1's low part in RAX and its high part in RDX.
1749 GenTree* loVal = op1->gtGetOp1();
1750 GenTree* hiVal = op1->gtGetOp2();
1751 assert(!loVal->isContained() && !hiVal->isContained());
1752
1753 assert(op2->IsCnsIntOrI());
1754 assert(tree->OperGet() == GT_UMOD);
1755
1756 // This situation also requires an internal register.
1757 buildInternalIntRegisterDefForNode(tree);
1758
1759 BuildUse(loVal, RBM_EAX);
1760 BuildUse(hiVal, RBM_EDX);
1761 srcCount = 2;
1762 }
1763 else
1764#endif
1765 {
        // If possible, we would like to have op1 in RAX to avoid a register move.
1767 RefPosition* op1Use = BuildUse(op1, RBM_EAX);
1768 tgtPrefUse = op1Use;
1769 srcCount = 1;
1770 }
1771
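    // The divisor may not be in RAX or RDX (which hold the dividend and the results), and it
    // must remain live until the instruction completes, so build delay-free uses for it.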
1772 srcCount += BuildDelayFreeUses(op2, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1773
1774 buildInternalRegisterUses();
1775
1776 regMaskTP killMask = getKillSetForModDiv(tree->AsOp());
1777 BuildDefsWithKills(tree, 1, dstCandidates, killMask);
1778 return srcCount;
1779}
1780
1781//------------------------------------------------------------------------
1782// BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1783//
1784// Arguments:
1785// tree - The node of interest
1786//
1787// Return Value:
1788// The number of sources consumed by this node.
1789//
1790int LinearScan::BuildIntrinsic(GenTree* tree)
1791{
    // Both the operand and its result must be of floating point type.
1793 GenTree* op1 = tree->gtGetOp1();
1794 assert(varTypeIsFloating(op1));
1795 assert(op1->TypeGet() == tree->TypeGet());
1796 RefPosition* internalFloatDef = nullptr;
1797
1798 switch (tree->gtIntrinsic.gtIntrinsicId)
1799 {
1800 case CORINFO_INTRINSIC_Abs:
            // Abs(float x)  = x & 0x7fffffff
            // Abs(double x) = x & 0x7fffffff ffffffff

            // In the case of Abs we need an internal register to hold the mask.

            // TODO-XArch-CQ: avoid using an internal register for the mask.
            // Andps and andpd both operate on 128-bit operands.
            // The data section constant that holds the mask is 64 bits in size.
            // Therefore, we need both the operand and the mask to be in an
            // xmm register. When we add support in the emitter to emit 128-bit
            // data constants and instructions that operate on 128-bit
            // memory operands we can avoid the need for an internal register.
1813 if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1814 {
1815 internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
1816 }
1817 break;
1818
1819#ifdef _TARGET_X86_
1820 case CORINFO_INTRINSIC_Cos:
1821 case CORINFO_INTRINSIC_Sin:
1822 NYI_X86("Math intrinsics Cos and Sin");
1823 break;
1824#endif // _TARGET_X86_
1825
1826 case CORINFO_INTRINSIC_Sqrt:
1827 case CORINFO_INTRINSIC_Round:
1828 case CORINFO_INTRINSIC_Ceiling:
1829 case CORINFO_INTRINSIC_Floor:
1830 break;
1831
1832 default:
            // Right now only Sqrt/Abs/Round/Ceiling/Floor are treated as math intrinsics.
1834 noway_assert(!"Unsupported math intrinsic");
1835 unreached();
1836 break;
1837 }
1838 assert(tree->gtGetOp2IfPresent() == nullptr);
1839 int srcCount;
1840 if (op1->isContained())
1841 {
1842 srcCount = BuildOperandUses(op1);
1843 }
1844 else
1845 {
1846 tgtPrefUse = BuildUse(op1);
1847 srcCount = 1;
1848 }
1849 if (internalFloatDef != nullptr)
1850 {
1851 buildInternalRegisterUses();
1852 }
1853 BuildDef(tree);
1854 return srcCount;
1855}
1856
1857#ifdef FEATURE_SIMD
1858//------------------------------------------------------------------------
1859// BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1860//
1861// Arguments:
1862// tree - The GT_SIMD node of interest
1863//
1864// Return Value:
1865// The number of sources consumed by this node.
1866//
1867int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1868{
1869 // Only SIMDIntrinsicInit can be contained. Other than that,
1870 // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1871 int dstCount = simdTree->IsValue() ? 1 : 0;
1872 bool buildUses = true;
1873 regMaskTP dstCandidates = RBM_NONE;
1874
1875 if (simdTree->isContained())
1876 {
1877 assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1878 }
1879 else if (dstCount != 1)
1880 {
1881 assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1882 (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1883 }
1884 SetContainsAVXFlags(true, simdTree->gtSIMDSize);
1885 GenTree* op1 = simdTree->gtGetOp1();
1886 GenTree* op2 = simdTree->gtGetOp2();
1887 int srcCount = 0;
1888
1889 switch (simdTree->gtSIMDIntrinsicID)
1890 {
1891 case SIMDIntrinsicInit:
1892 {
1893 // This sets all fields of a SIMD struct to the given value.
                // Mark op1 as contained if it is either zero or an int constant of all 1's,
                // or a float constant with a 16- or 32-byte simdType (AVX case).
1896 //
1897 // Note that for small int base types, the initVal has been constructed so that
1898 // we can use the full int value.
1899 CLANG_FORMAT_COMMENT_ANCHOR;
1900
1901#if !defined(_TARGET_64BIT_)
1902 if (op1->OperGet() == GT_LONG)
1903 {
1904 assert(op1->isContained());
1905 GenTree* op1lo = op1->gtGetOp1();
1906 GenTree* op1hi = op1->gtGetOp2();
1907
1908 if (op1lo->isContained())
1909 {
1910 srcCount = 0;
1911 assert(op1hi->isContained());
1912 assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1913 (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1914 }
1915 else
1916 {
1917 srcCount = 2;
1918 buildInternalFloatRegisterDefForNode(simdTree);
1919 setInternalRegsDelayFree = true;
1920 }
1921
1922 if (srcCount == 2)
1923 {
1924 BuildUse(op1lo, RBM_EAX);
1925 BuildUse(op1hi, RBM_EDX);
1926 }
1927 buildUses = false;
1928 }
1929#endif // !defined(_TARGET_64BIT_)
1930 }
1931 break;
1932
1933 case SIMDIntrinsicInitN:
1934 {
1935 var_types baseType = simdTree->gtSIMDBaseType;
1936 srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1937 // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1938 buildInternalFloatRegisterDefForNode(simdTree);
1939 int initCount = 0;
1940 for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1941 {
1942 assert(list->OperGet() == GT_LIST);
1943 GenTree* listItem = list->gtGetOp1();
1944 assert(listItem->TypeGet() == baseType);
1945 assert(!listItem->isContained());
1946 BuildUse(listItem);
1947 initCount++;
1948 }
1949 assert(initCount == srcCount);
1950 buildUses = false;
1951 }
1952 break;
1953
1954 case SIMDIntrinsicInitArray:
1955 // We have an array and an index, which may be contained.
1956 break;
1957
1958 case SIMDIntrinsicDiv:
1959 // SSE2 has no instruction support for division on integer vectors
1960 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1961 break;
1962
1963 case SIMDIntrinsicAbs:
            // float/double vectors: This gets implemented as a bitwise-AND operation
            // with a mask and hence should never be seen here.
            //
            // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
1968 assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1969 simdTree->gtSIMDBaseType == TYP_BYTE);
1970 assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1971 break;
1972
1973 case SIMDIntrinsicSqrt:
1974 // SSE2 has no instruction support for sqrt on integer vectors.
1975 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1976 break;
1977
1978 case SIMDIntrinsicAdd:
1979 case SIMDIntrinsicSub:
1980 case SIMDIntrinsicMul:
1981 case SIMDIntrinsicBitwiseAnd:
1982 case SIMDIntrinsicBitwiseAndNot:
1983 case SIMDIntrinsicBitwiseOr:
1984 case SIMDIntrinsicBitwiseXor:
1985 case SIMDIntrinsicMin:
1986 case SIMDIntrinsicMax:
1987 // SSE2 32-bit integer multiplication requires two temp regs
1988 if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
1989 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
1990 {
1991 buildInternalFloatRegisterDefForNode(simdTree);
1992 buildInternalFloatRegisterDefForNode(simdTree);
1993 }
1994 break;
1995
1996 case SIMDIntrinsicEqual:
1997 break;
1998
1999 // SSE2 doesn't support < and <= directly on int vectors.
2000 // Instead we need to use > and >= with swapped operands.
2001 case SIMDIntrinsicLessThan:
2002 case SIMDIntrinsicLessThanOrEqual:
2003 noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
2004 break;
2005
        // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
2007 // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2008 // Instead we need to use < and <= with swapped operands.
2009 case SIMDIntrinsicGreaterThan:
2010 noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2011 break;
2012
2013 case SIMDIntrinsicOpEquality:
2014 case SIMDIntrinsicOpInEquality:
2015 if (simdTree->gtGetOp2()->isContained())
2016 {
2017 // If the second operand is contained then ContainCheckSIMD has determined
2018 // that PTEST can be used. We only need a single source register and no
2019 // internal registers.
2020 }
2021 else
2022 {
2023 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2024 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2025 // and one internal INT register (to hold the result of PMOVMSKB).
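                // Roughly, the non-PTEST sequence looks like (register names illustrative):
                //    pcmpeqd  tmpSimd, op2Simd   ; per-element compare into the internal SIMD reg
                //    pmovmskb tmpInt, tmpSimd    ; collect per-byte results into the internal int reg
                //    cmp      tmpInt, 0xFFFF     ; for a 16-byte vector: did every byte match?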
2026 buildInternalIntRegisterDefForNode(simdTree);
2027 buildInternalFloatRegisterDefForNode(simdTree);
2028 }
2029 // These SIMD nodes only set the condition flags.
2030 dstCount = 0;
2031 break;
2032
2033 case SIMDIntrinsicDotProduct:
2034 // Float/Double vectors:
2035 // For SSE, or AVX with 32-byte vectors, we also need an internal register
2036 // as scratch. Further we need the targetReg and internal reg to be distinct
2037 // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2038 // don't need a tmpReg.
2039 //
2040 // 32-byte integer vector on SSE4/AVX:
2041 // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2042 // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2043 // registers since targetReg is an int type register.
2044 //
2045 // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2046 // and the need for scratch registers.
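            // For example, for Vector<int> on SSE4.1 the sequence is roughly (register names
            // illustrative):
            //    pmulld  tmpSimd, op2Simd    ; elementwise multiply
            //    phaddd  tmpSimd, tmpSimd    ; first horizontal add
            //    phaddd  tmpSimd, tmpSimd    ; second horizontal add leaves the sum in element 0
            //    movd    intDst, tmpSimd     ; move the scalar result to the integer target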
2047 if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2048 {
2049 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2050 (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32))
2051 {
2052 buildInternalFloatRegisterDefForNode(simdTree);
2053 setInternalRegsDelayFree = true;
2054 }
2055 // else don't need scratch reg(s).
2056 }
2057 else
2058 {
2059 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2060
                // No need to setInternalRegsDelayFree since targetReg is an
                // int type reg and is guaranteed to be different from xmm/ymm
                // regs.
2064 buildInternalFloatRegisterDefForNode(simdTree);
2065 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2066 {
2067 buildInternalFloatRegisterDefForNode(simdTree);
2068 }
2069 }
2070 break;
2071
2072 case SIMDIntrinsicGetItem:
2073 {
2074 // This implements get_Item method. The sources are:
2075 // - the source SIMD struct
2076 // - index (which element to get)
2077 // The result is baseType of SIMD struct.
2078 // op1 may be a contained memory op, but if so we will consume its address.
2079 // op2 may be a contained constant.
2080 op1 = simdTree->gtGetOp1();
2081 op2 = simdTree->gtGetOp2();
2082
2083 if (!op1->isContained())
2084 {
                // If the index is not a constant, we will use the SIMD temp location to store the vector.
                // Otherwise, if the baseType is floating point, the targetReg will be an xmm reg and we
                // can use that in the process of extracting the element.
                //
                // If the index is a constant and the base type is a small int, we can use pextrw, but on AVX
                // we will need a temp if we are indexing into the upper half of the AVX register.
                // In all other cases with a constant index, we need a temp xmm register to extract the
                // element if the index is other than zero.
2093
2094 if (!op2->IsCnsIntOrI())
2095 {
2096 (void)compiler->getSIMDInitTempVarNum();
2097 }
2098 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2099 {
2100 bool needFloatTemp;
2101 if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2102 (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2103 {
2104 int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2105 needFloatTemp = (byteShiftCnt >= 16);
2106 }
2107 else
2108 {
2109 needFloatTemp = !op2->IsIntegralConst(0);
2110 }
2111
2112 if (needFloatTemp)
2113 {
2114 buildInternalFloatRegisterDefForNode(simdTree);
2115 }
2116 }
2117#ifdef _TARGET_X86_
2118 // This logic is duplicated from genSIMDIntrinsicGetItem().
2119 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2120 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2121 // cases will require this, so the non-byteable registers can be excluded.
2122
2123 var_types baseType = simdTree->gtSIMDBaseType;
2124 if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2125 {
2126 bool ZeroOrSignExtnReqd = true;
2127 unsigned baseSize = genTypeSize(baseType);
2128 if (baseSize == 1)
2129 {
2130 if ((op2->gtIntCon.gtIconVal % 2) == 1)
2131 {
2132 ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2133 }
2134 }
2135 else
2136 {
2137 assert(baseSize == 2);
2138 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2139 }
2140 if (ZeroOrSignExtnReqd)
2141 {
2142 dstCandidates = allByteRegs();
2143 }
2144 }
2145#endif // _TARGET_X86_
2146 }
2147 }
2148 break;
2149
2150 case SIMDIntrinsicSetX:
2151 case SIMDIntrinsicSetY:
2152 case SIMDIntrinsicSetZ:
2153 case SIMDIntrinsicSetW:
2154 // We need an internal integer register for SSE2 codegen
2155 if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2156 {
2157 buildInternalIntRegisterDefForNode(simdTree);
2158 }
2159
2160 break;
2161
2162 case SIMDIntrinsicCast:
2163 break;
2164
2165 case SIMDIntrinsicConvertToSingle:
2166 if (simdTree->gtSIMDBaseType == TYP_UINT)
2167 {
2168 // We need an internal register different from targetReg.
2169 setInternalRegsDelayFree = true;
2170 buildInternalFloatRegisterDefForNode(simdTree);
2171 buildInternalFloatRegisterDefForNode(simdTree);
2172 // We also need an integer register.
2173 buildInternalIntRegisterDefForNode(simdTree);
2174 }
2175 break;
2176
2177 case SIMDIntrinsicConvertToInt32:
2178 break;
2179
2180 case SIMDIntrinsicWidenLo:
2181 case SIMDIntrinsicWidenHi:
2182 if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2183 {
2184 // We need an internal register different from targetReg.
2185 setInternalRegsDelayFree = true;
2186 buildInternalFloatRegisterDefForNode(simdTree);
2187 }
2188 break;
2189
2190 case SIMDIntrinsicConvertToInt64:
2191 // We need an internal register different from targetReg.
2192 setInternalRegsDelayFree = true;
2193 buildInternalFloatRegisterDefForNode(simdTree);
2194 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2195 {
2196 buildInternalFloatRegisterDefForNode(simdTree);
2197 }
2198 // We also need an integer register.
2199 buildInternalIntRegisterDefForNode(simdTree);
2200 break;
2201
2202 case SIMDIntrinsicConvertToDouble:
2203 // We need an internal register different from targetReg.
2204 setInternalRegsDelayFree = true;
2205 buildInternalFloatRegisterDefForNode(simdTree);
2206#ifdef _TARGET_X86_
2207 if (simdTree->gtSIMDBaseType == TYP_LONG)
2208 {
2209 buildInternalFloatRegisterDefForNode(simdTree);
2210 buildInternalFloatRegisterDefForNode(simdTree);
2211 }
2212 else
2213#endif
2214 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2215 {
2216 buildInternalFloatRegisterDefForNode(simdTree);
2217 }
2218 // We also need an integer register.
2219 buildInternalIntRegisterDefForNode(simdTree);
2220 break;
2221
2222 case SIMDIntrinsicNarrow:
2223 // We need an internal register different from targetReg.
2224 setInternalRegsDelayFree = true;
2225 buildInternalFloatRegisterDefForNode(simdTree);
2226 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2227 {
2228 buildInternalFloatRegisterDefForNode(simdTree);
2229 }
2230 break;
2231
2232 case SIMDIntrinsicShuffleSSE2:
2233 // Second operand is an integer constant and marked as contained.
2234 assert(simdTree->gtGetOp2()->isContainedIntOrIImmed());
2235 break;
2236
2237 case SIMDIntrinsicGetX:
2238 case SIMDIntrinsicGetY:
2239 case SIMDIntrinsicGetZ:
2240 case SIMDIntrinsicGetW:
2241 case SIMDIntrinsicGetOne:
2242 case SIMDIntrinsicGetZero:
2243 case SIMDIntrinsicGetCount:
2244 case SIMDIntrinsicGetAllOnes:
2245 assert(!"Get intrinsics should not be seen during Lowering.");
2246 unreached();
2247
2248 default:
2249 noway_assert(!"Unimplemented SIMD node type.");
2250 unreached();
2251 }
2252 if (buildUses)
2253 {
2254 assert(!op1->OperIs(GT_LIST));
2255 assert(srcCount == 0);
2256 // This is overly conservative, but is here for zero diffs.
2257 srcCount = BuildRMWUses(simdTree);
2258 }
2259 buildInternalRegisterUses();
2260 if (dstCount == 1)
2261 {
2262 BuildDef(simdTree, dstCandidates);
2263 }
2264 else
2265 {
2266 assert(dstCount == 0);
2267 }
2268 return srcCount;
2269}
2270#endif // FEATURE_SIMD
2271
2272#ifdef FEATURE_HW_INTRINSICS
2273//------------------------------------------------------------------------
2274// BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2275//
2276// Arguments:
//    intrinsicTree - The GT_HWIntrinsic node of interest
2278//
2279// Return Value:
2280// The number of sources consumed by this node.
2281//
2282int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2283{
2284 NamedIntrinsic intrinsicId = intrinsicTree->gtHWIntrinsicId;
2285 var_types baseType = intrinsicTree->gtSIMDBaseType;
2286 InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId);
2287 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
2288 int numArgs = HWIntrinsicInfo::lookupNumArgs(intrinsicTree);
2289
2290 if ((isa == InstructionSet_AVX) || (isa == InstructionSet_AVX2))
2291 {
2292 SetContainsAVXFlags(true, 32);
2293 }
2294
2295 GenTree* op1 = intrinsicTree->gtGetOp1();
2296 GenTree* op2 = intrinsicTree->gtGetOp2();
2297 GenTree* op3 = nullptr;
2298 GenTree* lastOp = nullptr;
2299
2300 int srcCount = 0;
2301 int dstCount = intrinsicTree->IsValue() ? 1 : 0;
2302
2303 regMaskTP dstCandidates = RBM_NONE;
2304
2305 if (op1 == nullptr)
2306 {
2307 assert(op2 == nullptr);
2308 assert(numArgs == 0);
2309 }
2310 else
2311 {
2312 if (op1->OperIsList())
2313 {
2314 assert(op2 == nullptr);
2315 assert(numArgs >= 3);
2316
2317 GenTreeArgList* argList = op1->AsArgList();
2318
2319 op1 = argList->Current();
2320 argList = argList->Rest();
2321
2322 op2 = argList->Current();
2323 argList = argList->Rest();
2324
2325 op3 = argList->Current();
2326
2327 while (argList->Rest() != nullptr)
2328 {
2329 argList = argList->Rest();
2330 }
2331
2332 lastOp = argList->Current();
2333 argList = argList->Rest();
2334
2335 assert(argList == nullptr);
2336 }
2337 else if (op2 != nullptr)
2338 {
2339 assert(numArgs == 2);
2340 lastOp = op2;
2341 }
2342 else
2343 {
2344 assert(numArgs == 1);
2345 lastOp = op1;
2346 }
2347
2348 assert(lastOp != nullptr);
2349
2350 bool buildUses = true;
2351
2352 if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
2353 {
2354 if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
2355 {
2356 assert(!lastOp->IsCnsIntOrI());
2357
                // We need two extra registers when lastOp isn't a constant so that
                // the offset into the jump table for the fallback path
                // can be computed.
2361 buildInternalIntRegisterDefForNode(intrinsicTree);
2362 buildInternalIntRegisterDefForNode(intrinsicTree);
2363 }
2364 }
2365
2366 // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
2367 // is not allocated the same register as the target.
2368 bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
2369
2370 // Create internal temps, and handle any other special requirements.
2371 // Note that the default case for building uses will handle the RMW flag, but if the uses
2372 // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
2373 // must be handled within the case.
2374 switch (intrinsicId)
2375 {
2376 case NI_Base_Vector128_CreateScalarUnsafe:
2377 case NI_Base_Vector128_ToScalar:
2378 case NI_Base_Vector256_CreateScalarUnsafe:
2379 case NI_Base_Vector256_ToScalar:
2380 {
2381 assert(numArgs == 1);
2382
2383 if (varTypeIsFloating(baseType))
2384 {
2385 if (op1->isContained())
2386 {
2387 srcCount += BuildOperandUses(op1);
2388 }
2389 else
2390 {
2391 // We will either be in memory and need to be moved
2392 // into a register of the appropriate size or we
2393 // are already in an XMM/YMM register and can stay
2394 // where we are.
2395
2396 tgtPrefUse = BuildUse(op1);
2397 srcCount += 1;
2398 }
2399
2400 buildUses = false;
2401 }
2402 break;
2403 }
2404
2405 case NI_Base_Vector128_ToVector256:
2406 case NI_Base_Vector128_ToVector256Unsafe:
2407 case NI_Base_Vector256_GetLower:
2408 {
2409 assert(numArgs == 1);
2410
2411 if (op1->isContained())
2412 {
2413 srcCount += BuildOperandUses(op1);
2414 }
2415 else
2416 {
2417 // We will either be in memory and need to be moved
2418 // into a register of the appropriate size or we
2419 // are already in an XMM/YMM register and can stay
2420 // where we are.
2421
2422 tgtPrefUse = BuildUse(op1);
2423 srcCount += 1;
2424 }
2425
2426 buildUses = false;
2427 break;
2428 }
2429
2430 case NI_SSE_CompareEqualOrderedScalar:
2431 case NI_SSE_CompareEqualUnorderedScalar:
2432 case NI_SSE_CompareNotEqualOrderedScalar:
2433 case NI_SSE_CompareNotEqualUnorderedScalar:
2434 case NI_SSE2_CompareEqualOrderedScalar:
2435 case NI_SSE2_CompareEqualUnorderedScalar:
2436 case NI_SSE2_CompareNotEqualOrderedScalar:
2437 case NI_SSE2_CompareNotEqualUnorderedScalar:
2438 {
2439 buildInternalIntRegisterDefForNode(intrinsicTree, allByteRegs());
2440 setInternalRegsDelayFree = true;
2441 break;
2442 }
2443
2444 case NI_SSE2_MaskMove:
2445 {
2446 assert(numArgs == 3);
2447 assert(!isRMW);
2448
2449 // MaskMove hardcodes the destination (op3) in DI/EDI/RDI
2450 srcCount += BuildOperandUses(op1);
2451 srcCount += BuildOperandUses(op2);
2452 srcCount += BuildOperandUses(op3, RBM_EDI);
2453
2454 buildUses = false;
2455 break;
2456 }
2457
2458 case NI_SSE41_BlendVariable:
2459 {
2460 assert(numArgs == 3);
2461
2462 if (!compiler->canUseVexEncoding())
2463 {
2464 assert(isRMW);
2465
                // The SSE4.1 blendv* instructions hardcode the mask vector (op3) in XMM0.
2467 srcCount += BuildOperandUses(op1);
2468 srcCount += BuildDelayFreeUses(op2);
2469 srcCount += BuildDelayFreeUses(op3, RBM_XMM0);
2470
2471 buildUses = false;
2472 }
2473 break;
2474 }
2475
2476 case NI_SSE41_TestAllOnes:
2477 {
2478 buildInternalFloatRegisterDefForNode(intrinsicTree);
2479 break;
2480 }
2481
2482 case NI_SSE41_Extract:
2483 {
2484 if (baseType == TYP_FLOAT)
2485 {
2486 buildInternalIntRegisterDefForNode(intrinsicTree);
2487 }
2488#ifdef _TARGET_X86_
2489 else if (varTypeIsByte(baseType))
2490 {
2491 dstCandidates = allByteRegs();
2492 }
2493#endif
2494 break;
2495 }
2496
2497#ifdef _TARGET_X86_
2498 case NI_SSE42_Crc32:
2499 case NI_SSE42_X64_Crc32:
2500 {
2501 // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
2502 // to the code generator. We may want to encode the overload info in another way.
2503
2504 assert(numArgs == 2);
2505 assert(isRMW);
2506
2507 // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2508 srcCount += BuildOperandUses(op1);
2509 srcCount += BuildDelayFreeUses(op2, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);
2510
2511 buildUses = false;
2512 break;
2513 }
2514#endif // _TARGET_X86_
2515
2516 case NI_BMI2_MultiplyNoFlags:
2517 case NI_BMI2_X64_MultiplyNoFlags:
2518 {
2519 assert(numArgs == 2 || numArgs == 3);
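                // mulx takes one multiplicand implicitly in EDX/RDX, which is why op1 is
                // constrained to RBM_EDX below.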
2520 srcCount += BuildOperandUses(op1, RBM_EDX);
2521 srcCount += BuildOperandUses(op2);
2522 if (numArgs == 3)
2523 {
                // The op3 reg should be different from the target reg in order to
                // store the lower half result after executing the instruction.
                srcCount += BuildDelayFreeUses(op3);
                // Need an internal register different from the dst to take the lower half result.
2528 buildInternalIntRegisterDefForNode(intrinsicTree);
2529 setInternalRegsDelayFree = true;
2530 }
2531 buildUses = false;
2532 break;
2533 }
2534
2535 case NI_FMA_MultiplyAdd:
2536 case NI_FMA_MultiplyAddNegated:
2537 case NI_FMA_MultiplyAddNegatedScalar:
2538 case NI_FMA_MultiplyAddScalar:
2539 case NI_FMA_MultiplyAddSubtract:
2540 case NI_FMA_MultiplySubtract:
2541 case NI_FMA_MultiplySubtractAdd:
2542 case NI_FMA_MultiplySubtractNegated:
2543 case NI_FMA_MultiplySubtractNegatedScalar:
2544 case NI_FMA_MultiplySubtractScalar:
2545 {
2546 assert(numArgs == 3);
2547 assert(isRMW);
2548
2549 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2550
2551 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
2552 assert(!copiesUpperBits || !op1->isContained());
2553
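                // The 132/213/231 FMA forms differ in which operands are multiplied, which one
                // is added, and which one may come from memory; the containment checks below
                // select the form (see the per-case comments).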
2554 if (op3->isContained())
2555 {
2556 // 213 form: op1 = (op2 * op1) + [op3]
2557
2558 if (copiesUpperBits)
2559 {
2560 tgtPrefUse = BuildUse(op1);
2561
2562 srcCount += 1;
2563 srcCount += BuildDelayFreeUses(op2);
2564 }
2565 else
2566 {
2567 // op1 and op2 are commutative, so don't
2568 // set either to be tgtPref or delayFree
2569
2570 srcCount += BuildOperandUses(op1);
2571 srcCount += BuildOperandUses(op2);
2572 }
2573
2574 srcCount += BuildOperandUses(op3);
2575 }
2576 else if (op2->isContained())
2577 {
2578 // 132 form: op1 = (op1 * op3) + [op2]
2579
2580 tgtPrefUse = BuildUse(op1);
2581
2582 srcCount += 1;
2583 srcCount += BuildOperandUses(op2);
2584 srcCount += BuildDelayFreeUses(op3);
2585 }
2586 else if (op1->isContained())
2587 {
2588 // 231 form: op3 = (op2 * op3) + [op1]
2589
2590 tgtPrefUse = BuildUse(op3);
2591
2592 srcCount += BuildOperandUses(op1);
2593 srcCount += BuildDelayFreeUses(op2);
2594 srcCount += 1;
2595 }
2596 else
2597 {
2598 // 213 form: op1 = (op2 * op1) + op3
2599
2600 if (copiesUpperBits)
2601 {
2602 tgtPrefUse = BuildUse(op1);
2603
2604 srcCount += 1;
2605 srcCount += BuildDelayFreeUses(op2);
2606 }
2607 else
2608 {
2609 // op1 and op2 are commutative, so don't
2610 // set either to be tgtPref or delayFree
2611
2612 srcCount += BuildOperandUses(op1);
2613 srcCount += BuildOperandUses(op2);
2614 }
2615
2616 srcCount += BuildDelayFreeUses(op3);
2617 }
2618
2619 buildUses = false;
2620 break;
2621 }
2622
2623 case NI_AVX2_GatherVector128:
2624 case NI_AVX2_GatherVector256:
2625 {
2626 assert(numArgs == 3);
2627 // Any pair of the index, mask, or destination registers should be different
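            // (The AVX2 gather instructions require the destination, index, and mask registers
            // to all be distinct.)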
2628 srcCount += BuildOperandUses(op1);
2629 srcCount += BuildDelayFreeUses(op2);
2630
            // Get a tmp register for the mask, which will be cleared by the gather instructions.
2632 buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2633 setInternalRegsDelayFree = true;
2634
2635 buildUses = false;
2636 break;
2637 }
2638
2639 case NI_AVX2_GatherMaskVector128:
2640 case NI_AVX2_GatherMaskVector256:
2641 {
2642 assert(numArgs == 5);
2643 // Any pair of the index, mask, or destination registers should be different
2644 srcCount += BuildOperandUses(op1);
2645 srcCount += BuildOperandUses(op2);
2646 srcCount += BuildDelayFreeUses(op3);
2647
2648 assert(intrinsicTree->gtGetOp1()->OperIsList());
2649 GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList();
2650 GenTree* op4 = argList->Rest()->Rest()->Rest()->Current();
2651 srcCount += BuildDelayFreeUses(op4);
2652
            // Get a tmp register for the mask, which will be cleared by the gather instructions.
2654 buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2655 setInternalRegsDelayFree = true;
2656
2657 buildUses = false;
2658 break;
2659 }
2660
2661 default:
2662 {
2663 assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
2664 break;
2665 }
2666 }
2667
2668 if (buildUses)
2669 {
2670 assert((numArgs > 0) && (numArgs < 4));
2671
2672 srcCount += BuildOperandUses(op1);
2673
2674 if (op2 != nullptr)
2675 {
2676 srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
2677
2678 if (op3 != nullptr)
2679 {
2680 srcCount += (isRMW) ? BuildDelayFreeUses(op3) : BuildOperandUses(op3);
2681 }
2682 }
2683 }
2684
2685 buildInternalRegisterUses();
2686 }
2687
2688 if (dstCount == 1)
2689 {
2690 BuildDef(intrinsicTree, dstCandidates);
2691 }
2692 else
2693 {
2694 assert(dstCount == 0);
2695 }
2696
2697 return srcCount;
2698}
2699#endif
2700
2701//------------------------------------------------------------------------
2702// BuildCast: Set the NodeInfo for a GT_CAST.
2703//
2704// Arguments:
2705// cast - The GT_CAST node
2706//
2707// Return Value:
2708// The number of sources consumed by this node.
2709//
2710int LinearScan::BuildCast(GenTreeCast* cast)
2711{
2712 GenTree* src = cast->gtGetOp1();
2713
2714 const var_types srcType = genActualType(src->TypeGet());
2715 const var_types castType = cast->gtCastType;
2716
2717 regMaskTP candidates = RBM_NONE;
2718#ifdef _TARGET_X86_
2719 if (varTypeIsByte(castType))
2720 {
2721 candidates = allByteRegs();
2722 }
2723
2724 assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained()));
2725#else
2726 // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary
2727 // register to extract the upper 32 bits of the 64 bit source register.
2728 if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT))
2729 {
        // Here we don't need the internal register to be different from targetReg;
        // rather, we require it to be different from the operand's reg.
2732 buildInternalIntRegisterDefForNode(cast);
2733 }
2734#endif
2735
2736 int srcCount = BuildOperandUses(src, candidates);
2737 buildInternalRegisterUses();
2738 BuildDef(cast, candidates);
2739 return srcCount;
2740}
2741
2742//-----------------------------------------------------------------------------------------
2743// BuildIndir: Specify register requirements for address expression of an indirection operation.
2744//
2745// Arguments:
2746// indirTree - GT_IND or GT_STOREIND gentree node
2747//
2748// Return Value:
2749// The number of sources consumed by this node.
2750//
2751int LinearScan::BuildIndir(GenTreeIndir* indirTree)
2752{
2753 // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2754 // it has no register requirements.
2755 if (indirTree->TypeGet() == TYP_STRUCT)
2756 {
2757 return 0;
2758 }
2759
2760#ifdef FEATURE_SIMD
2761 RefPosition* internalFloatDef = nullptr;
2762 if (indirTree->TypeGet() == TYP_SIMD12)
2763 {
2764 // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2765 assert(!indirTree->Addr()->isContained());
2766
2767 // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
2768 // To assemble the vector properly we would need an additional
2769 // XMM register.
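        // Roughly, a SIMD12 access is an 8-byte access plus a 4-byte access, with the two
        // halves combined (or split) using the internal xmm register as scratch.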
2770 internalFloatDef = buildInternalFloatRegisterDefForNode(indirTree);
2771
2772 // In case of GT_IND we need an internal register different from targetReg and
2773 // both of the registers are used at the same time.
2774 if (indirTree->OperGet() == GT_IND)
2775 {
2776 setInternalRegsDelayFree = true;
2777 }
2778 }
2779#endif // FEATURE_SIMD
2780
2781 regMaskTP indirCandidates = RBM_NONE;
2782 int srcCount = BuildIndirUses(indirTree, indirCandidates);
2783 if (indirTree->gtOper == GT_STOREIND)
2784 {
2785 GenTree* source = indirTree->gtGetOp2();
2786 if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2787 {
2788 // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2789 // As it happens, the Shift or Rotate cases are the only ones with special requirements.
2790 assert(source->isContained() && source->OperIsRMWMemOp());
            GenTreeIndir* otherIndir = nullptr;
2793
2794 if (source->OperIsShiftOrRotate())
2795 {
2796 srcCount += BuildShiftRotate(source);
2797 }
2798 else
2799 {
2800 regMaskTP srcCandidates = RBM_NONE;
2801
2802#ifdef _TARGET_X86_
2803 // Determine if we need byte regs for the non-mem source, if any.
2804 // Note that BuildShiftRotate (above) will handle the byte requirement as needed,
2805 // but STOREIND isn't itself an RMW op, so we have to explicitly set it for that case.
2806
2807 GenTree* nonMemSource = nullptr;
2808
2809 if (indirTree->AsStoreInd()->IsRMWDstOp1())
2810 {
2811 otherIndir = source->gtGetOp1()->AsIndir();
2812 if (source->OperIsBinary())
2813 {
2814 nonMemSource = source->gtGetOp2();
2815 }
2816 }
2817 else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2818 {
2819 otherIndir = source->gtGetOp2()->AsIndir();
2820 nonMemSource = source->gtGetOp1();
2821 }
2822 if ((nonMemSource != nullptr) && !nonMemSource->isContained() && varTypeIsByte(indirTree))
2823 {
2824 srcCandidates = RBM_BYTE_REGS;
2825 }
2826#endif
2827 if (otherIndir != nullptr)
2828 {
2829 // Any lclVars in the addressing mode of this indirection are contained.
2830 // If they are marked as lastUse, transfer the last use flag to the store indir.
2831 GenTree* base = otherIndir->Base();
2832 GenTree* dstBase = indirTree->Base();
2833 CheckAndMoveRMWLastUse(base, dstBase);
2834 GenTree* index = otherIndir->Index();
2835 GenTree* dstIndex = indirTree->Index();
2836 CheckAndMoveRMWLastUse(index, dstIndex);
2837 }
2838 srcCount += BuildBinaryUses(source->AsOp(), srcCandidates);
2839 }
2840 }
2841 else
2842 {
2843#ifdef _TARGET_X86_
2844 if (varTypeIsByte(indirTree) && !source->isContained())
2845 {
2846 BuildUse(source, allByteRegs());
2847 srcCount++;
2848 }
2849 else
2850#endif
2851 {
2852 srcCount += BuildOperandUses(source);
2853 }
2854 }
2855 }
2856#ifdef FEATURE_SIMD
2857 if (varTypeIsSIMD(indirTree))
2858 {
2859 SetContainsAVXFlags(true, genTypeSize(indirTree->TypeGet()));
2860 }
2861 buildInternalRegisterUses();
2862#endif // FEATURE_SIMD
2863
2864 if (indirTree->gtOper != GT_STOREIND)
2865 {
2866 BuildDef(indirTree);
2867 }
2868 return srcCount;
2869}
2870
2871//------------------------------------------------------------------------
2872// BuildMul: Set the NodeInfo for a multiply.
2873//
2874// Arguments:
2875// tree - The node of interest
2876//
2877// Return Value:
2878// The number of sources consumed by this node.
2879//
2880int LinearScan::BuildMul(GenTree* tree)
2881{
2882 assert(tree->OperIsMul());
2883 GenTree* op1 = tree->gtGetOp1();
2884 GenTree* op2 = tree->gtGetOp2();
2885
2886 // Only non-floating point mul has special requirements
2887 if (varTypeIsFloating(tree->TypeGet()))
2888 {
2889 return BuildSimple(tree);
2890 }
2891
2892 int srcCount = BuildBinaryUses(tree->AsOp());
2893 int dstCount = 1;
2894 regMaskTP dstCandidates = RBM_NONE;
2895
2896 bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2897 bool requiresOverflowCheck = tree->gtOverflowEx();
2898
2899 // There are three forms of x86 multiply:
2900 // one-op form: RDX:RAX = RAX * r/m
2901 // two-op form: reg *= r/m
2902 // three-op form: reg = r/m * imm
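    // For example (32-bit operands):
    //    mul  ecx            ; one-op form:   EDX:EAX = EAX * ECX
    //    imul eax, ecx       ; two-op form:   EAX = EAX * ECX
    //    imul eax, ecx, 12   ; three-op form: EAX = ECX * 12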
2903
2904 // This special widening 32x32->64 MUL is not used on x64
2905 CLANG_FORMAT_COMMENT_ANCHOR;
2906#if defined(_TARGET_X86_)
2907 if (tree->OperGet() != GT_MUL_LONG)
2908#endif
2909 {
2910 assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2911 }
2912
2913 // We do use the widening multiply to implement
2914 // the overflow checking for unsigned multiply
2915 //
2916 if (isUnsignedMultiply && requiresOverflowCheck)
2917 {
2918 // The only encoding provided is RDX:RAX = RAX * rm
2919 //
2920 // Here we set RAX as the only destination candidate
2921 // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2922 //
2923 dstCandidates = RBM_RAX;
2924 }
2925 else if (tree->OperGet() == GT_MULHI)
2926 {
        // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
        // upper 32 bits of the result, set the destination candidate to RBM_RDX.
2929 dstCandidates = RBM_RDX;
2930 }
2931#if defined(_TARGET_X86_)
2932 else if (tree->OperGet() == GT_MUL_LONG)
2933 {
        // Have to use the encoding: RDX:RAX = RAX * rm
2935 dstCandidates = RBM_RAX | RBM_RDX;
2936 dstCount = 2;
2937 }
2938#endif
2939 GenTree* containedMemOp = nullptr;
2940 if (op1->isContained() && !op1->IsCnsIntOrI())
2941 {
2942 assert(!op2->isContained() || op2->IsCnsIntOrI());
2943 containedMemOp = op1;
2944 }
2945 else if (op2->isContained() && !op2->IsCnsIntOrI())
2946 {
2947 containedMemOp = op2;
2948 }
2949 regMaskTP killMask = getKillSetForMul(tree->AsOp());
2950 BuildDefsWithKills(tree, dstCount, dstCandidates, killMask);
2951 return srcCount;
2952}
2953
2954//------------------------------------------------------------------------------
// SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point type, and
// set the Contains256bitAVX flag when the SIMD vector size is 32 bytes.
2957//
2958// Arguments:
2959// isFloatingPointType - true if it is floating point type
2960// sizeOfSIMDVector - SIMD Vector size
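//
// Notes:
//    These flags are consumed by the emitter, among other things to help decide where
//    vzeroupper instructions are needed to avoid AVX-SSE transition penalties.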
2961//
2962void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
2963{
2964 if (isFloatingPointType && compiler->canUseVexEncoding())
2965 {
2966 compiler->getEmitter()->SetContainsAVX(true);
2967 if (sizeOfSIMDVector == 32)
2968 {
2969 compiler->getEmitter()->SetContains256bitAVX(true);
2970 }
2971 }
2972}
2973
2974#endif // _TARGET_XARCH_
2975