1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
5/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7XX XX
8XX Lowering for AMD64, x86 XX
9XX XX
10XX This encapsulates all the logic for lowering trees for the AMD64 XX
11XX architecture. For a more detailed view of what lowering is, please XX
12XX take a look at Lower.cpp XX
13XX XX
14XX XX
15XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
17*/
18
19#include "jitpch.h"
20#ifdef _MSC_VER
21#pragma hdrstop
22#endif
23
24#ifdef _TARGET_XARCH_ // This file is only used for xarch
25
26#include "jit.h"
27#include "sideeffects.h"
28#include "lower.h"
29
30// xarch supports both ROL and ROR instructions so no lowering is required.
31void Lowering::LowerRotate(GenTree* tree)
32{
33 ContainCheckShiftRotate(tree->AsOp());
34}
35
36//------------------------------------------------------------------------
37// LowerStoreLoc: Lower a store of a lclVar
38//
39// Arguments:
40// storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
41//
42// Notes:
43// This involves:
44// - Handling of contained immediates.
45// - Widening small constant stores (with sign- or zero-extension as appropriate).
46
47void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
48{
49 // Try to widen the ops if they are going into a local var.
50 if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
51 {
52 GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon();
53 ssize_t ival = con->gtIconVal;
54
55 unsigned varNum = storeLoc->gtLclNum;
56 LclVarDsc* varDsc = comp->lvaTable + varNum;
57
58 if (varDsc->lvIsSIMDType())
59 {
60 noway_assert(storeLoc->gtType != TYP_STRUCT);
61 }
62 unsigned size = genTypeSize(storeLoc);
63 // If we are storing a constant into a local variable
64 // we extend the size of the store here
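        // For example, if a TYP_BYTE local (that is not a promoted struct field) is assigned
        // the constant 0xFF, the store is widened below to a TYP_INT store of the
        // sign-extended value 0xFFFFFFFF.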
65 if ((size < 4) && !varTypeIsStruct(varDsc))
66 {
67 if (!varTypeIsUnsigned(varDsc))
68 {
69 if (genTypeSize(storeLoc) == 1)
70 {
71 if ((ival & 0x7f) != ival)
72 {
73 ival = ival | 0xffffff00;
74 }
75 }
76 else
77 {
78 assert(genTypeSize(storeLoc) == 2);
79 if ((ival & 0x7fff) != ival)
80 {
81 ival = ival | 0xffff0000;
82 }
83 }
84 }
85
86 // A local stack slot is at least 4 bytes in size, regardless of
87 // what the local var is typed as, so auto-promote it here
88 // unless it is a field of a promoted struct
89 // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
90 if (!varDsc->lvIsStructField)
91 {
92 storeLoc->gtType = TYP_INT;
93 con->SetIconValue(ival);
94 }
95 }
96 }
97 if (storeLoc->OperIs(GT_STORE_LCL_FLD))
98 {
99 // We should only encounter this for lclVars that are lvDoNotEnregister.
100 verifyLclFldDoNotEnregister(storeLoc->gtLclNum);
101 }
102 ContainCheckStoreLoc(storeLoc);
103}
104
105//------------------------------------------------------------------------
106// LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained.
107//
108// Arguments:
109// node - The indirect store node (GT_STORE_IND) of interest
110//
111// Return Value:
112// None.
113//
114void Lowering::LowerStoreIndir(GenTreeIndir* node)
115{
116    // Mark the GT_STOREIND node to indicate that it is not yet known
117    // whether it represents an RMW memory op.
118 node->AsStoreInd()->SetRMWStatusDefault();
119
120 if (!varTypeIsFloating(node))
121 {
122 // Perform recognition of trees with the following structure:
123 // StoreInd(addr, BinOp(expr, GT_IND(addr)))
124 // to be able to fold this into an instruction of the form
125 // BINOP [addr], register
126 // where register is the actual place where 'expr' is computed.
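        // For example, a tree such as StoreInd(addr, Add(Ind(addr), 5)) can be emitted as a
        // single "add dword ptr [addr], 5"; see IsRMWMemOpRootedAtStoreInd below.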
127 //
128 // SSE2 doesn't support RMW form of instructions.
129 if (LowerRMWMemOp(node))
130 {
131 return;
132 }
133 }
134 ContainCheckStoreIndir(node);
135}
136
137//------------------------------------------------------------------------
138// LowerBlockStore: Set block store type
139//
140// Arguments:
141// blkNode - The block store node of interest
142//
143// Return Value:
144// None.
145//
146void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
147{
148 GenTree* dstAddr = blkNode->Addr();
149 unsigned size = blkNode->gtBlkSize;
150 GenTree* source = blkNode->Data();
151 Compiler* compiler = comp;
152 GenTree* srcAddrOrFill = nullptr;
153 bool isInitBlk = blkNode->OperIsInitBlkOp();
154
155 if (!isInitBlk)
156 {
157 // CopyObj or CopyBlk
158 if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
159 {
160 blkNode->SetOper(GT_STORE_BLK);
161 }
162 if (source->gtOper == GT_IND)
163 {
164 srcAddrOrFill = blkNode->Data()->gtGetOp1();
165 }
166 }
167
168 if (isInitBlk)
169 {
170 GenTree* initVal = source;
171 if (initVal->OperIsInitVal())
172 {
173 initVal->SetContained();
174 initVal = initVal->gtGetOp1();
175 }
176 srcAddrOrFill = initVal;
177 // If we have an InitBlk with constant block size we can optimize several ways:
178 // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
179 // we use rep stosb since this reduces the register pressure in LSRA and we have
180 // roughly the same performance as calling the helper.
181 // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
182 // we can speed this up by unrolling the loop using SSE2 stores. The reason for
183        //    this threshold is that, as of our last investigation (Fall 2013), more than 95% of initblks
184        //    in our framework assemblies were actually <= INITBLK_UNROLL_LIMIT bytes in size, so this is the
185 // preferred code sequence for the vast majority of cases.
186
187        // This threshold decides between calling the helper and letting the JIT emit
188        // an inline code sequence of its choice.
189 unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
190
191 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
192 if (size != 0 && size <= helperThreshold)
193 {
194 // Always favor unrolling vs rep stos.
195 if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
196 {
197 // The fill value of an initblk is interpreted to hold a
198                // value of (unsigned int8); however, a constant of any size
199 // may practically reside on the evaluation stack. So extract
200 // the lower byte out of the initVal constant and replicate
201 // it to a larger constant whose size is sufficient to support
202 // the largest width store of the desired inline expansion.
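                // For example (on _TARGET_AMD64_ with size >= REGSIZE_BYTES), a fill byte of 0xAB
                // is replicated to the TYP_LONG constant 0xABABABABABABABAB so that it can be
                // stored eight bytes at a time.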
203
204 ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
205#ifdef _TARGET_AMD64_
206 if (size < REGSIZE_BYTES)
207 {
208 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
209 }
210 else
211 {
212 initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
213 initVal->gtType = TYP_LONG;
214 if ((fill == 0) && ((size & 0xf) == 0))
215 {
216 MakeSrcContained(blkNode, source);
217 }
218 }
219#else // !_TARGET_AMD64_
220 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
221#endif // !_TARGET_AMD64_
222
223 if ((fill == 0) && ((size & 0xf) == 0))
224 {
225 MakeSrcContained(blkNode, source);
226 }
227 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
228 }
229 else
230 {
231 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
232 }
233 }
234 else
235 {
236#ifdef _TARGET_AMD64_
237 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
238#else // !_TARGET_AMD64_
239 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
240#endif // !_TARGET_AMD64_
241 }
242 }
243 else
244 {
245 if (blkNode->gtOper == GT_STORE_OBJ)
246 {
247 // CopyObj
248
249 GenTreeObj* cpObjNode = blkNode->AsObj();
250
251 unsigned slots = cpObjNode->gtSlots;
252
253#ifdef DEBUG
254 // CpObj must always have at least one GC-Pointer as a member.
255 assert(cpObjNode->gtGcPtrCount > 0);
256
257 assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
258
259 CORINFO_CLASS_HANDLE clsHnd = cpObjNode->gtClass;
260 size_t classSize = comp->info.compCompHnd->getClassSize(clsHnd);
261 size_t blkSize = roundUp(classSize, TARGET_POINTER_SIZE);
262
263            // Currently, the EE always rounds up a class data structure, so
264            // we are not handling the case of a struct whose size is not a multiple
265            // of the pointer size. This behavior may change in the future, so in order
266            // to keep things correct let's assert it just to be safe. Going forward we
267            // should simply handle this case.
268 assert(classSize == blkSize);
269 assert((blkSize / TARGET_POINTER_SIZE) == slots);
270 assert(cpObjNode->HasGCPtr());
271#endif
272
273 bool IsRepMovsProfitable = false;
274
275 // If the destination is not on the stack, let's find out if we
276 // can improve code size by using rep movsq instead of generating
277 // sequences of movsq instructions.
278 if (!dstAddr->OperIsLocalAddr())
279 {
280 // Let's inspect the struct/class layout and determine if it's profitable
281 // to use rep movsq for copying non-gc memory instead of using single movsq
282 // instructions for each memory slot.
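                // For example, with a GC layout of { GC, NONE, NONE, NONE, NONE, GC } the longest
                // contiguous run of non-GC slots is 4; rep movsq is considered profitable only if
                // some run reaches CPOBJ_NONGC_SLOTS_LIMIT.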
283 unsigned i = 0;
284 BYTE* gcPtrs = cpObjNode->gtGcPtrs;
285
286 do
287 {
288 unsigned nonGCSlots = 0;
289 // Measure a contiguous non-gc area inside the struct and note the maximum.
290 while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
291 {
292 nonGCSlots++;
293 i++;
294 }
295
296 while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
297 {
298 i++;
299 }
300
301 if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
302 {
303 IsRepMovsProfitable = true;
304 break;
305 }
306 } while (i < slots);
307 }
308 else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
309 {
310 IsRepMovsProfitable = true;
311 }
312
313 // There are two cases in which we need to materialize the
314 // struct size:
315 // a) When the destination is on the stack we don't need to use the
316            //    write barrier; we can simply call rep movsq and get a win in code size.
317 // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
318 // to use rep movsq instead of a sequence of single movsq instructions. According to the
319            //    Intel Manual, the sweet spot for small structs is between 4 and 12 slots in size, where
320            //    the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX and calling rep movsq).
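            // Roughly, the rep movsq sequence emitted for such a region is:
            //     mov rcx, <number of non-GC slots>
            //     rep movsq
            // with RSI and RDI already pointing at the source and destination.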
321 if (IsRepMovsProfitable)
322 {
323 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
324 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
325 }
326 else
327 {
328 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
329 }
330 }
331 else
332 {
333 assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));
334 // CopyBlk
335            // In the case of a CpBlk with a constant size less than CPBLK_MOVS_LIMIT,
336 // we can use rep movs to generate code instead of the helper call.
337
338            // This threshold decides between calling the helper and letting the JIT emit
339            // an inline code sequence of its choice.
340 unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
341
342 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
343 if ((size != 0) && (size <= helperThreshold))
344 {
345 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
346                // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
347 // our framework assemblies, so this is the main code generation scheme we'll use.
348 if (size <= CPBLK_UNROLL_LIMIT)
349 {
350 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
351
352 // If src or dst are on stack, we don't have to generate the address
353 // into a register because it's just some constant+SP.
354 if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr())
355 {
356 MakeSrcContained(blkNode, srcAddrOrFill);
357 }
358
359 if (dstAddr->OperIsLocalAddr())
360 {
361 MakeSrcContained(blkNode, dstAddr);
362 }
363 }
364 else
365 {
366 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
367 }
368 }
369#ifdef _TARGET_AMD64_
370 else
371 {
372 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
373 }
374#elif defined(_TARGET_X86_)
375 else
376 {
377 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
378 }
379#endif // _TARGET_X86_
380 assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
381 }
382
383 // CopyObj or CopyBlk
384 if (source->gtOper == GT_IND)
385 {
386 // The GT_IND is contained, but the address must be in a register unless it is local.
387 MakeSrcContained(blkNode, source);
388 GenTree* addr = source->AsIndir()->Addr();
389 if (!addr->OperIsLocalAddr())
390 {
391 addr->ClearContained();
392 }
393 }
394 else if (!source->IsMultiRegCall() && !source->OperIsSIMD() && !source->OperIsSimdHWIntrinsic())
395 {
396 assert(source->IsLocal());
397 MakeSrcContained(blkNode, source);
398 }
399 }
400}
401
402//------------------------------------------------------------------------
403// LowerPutArgStk: Lower a GT_PUTARG_STK.
404//
405// Arguments:
406// tree - The node of interest
407//
408// Return Value:
409// None.
410//
411void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
412{
413#ifdef _TARGET_X86_
414 if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
415 {
416 putArgStk->gtNumberReferenceSlots = 0;
417 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;
418
419 GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
420
421 // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
422 // of uses is visible to LSRA.
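        // For example, fields at offsets {0, 4, 8} end up ordered 8, 4, 0 after the insertion
        // sort below, matching the order in which they will be pushed.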
423 unsigned fieldCount = 0;
424 GenTreeFieldList* head = nullptr;
425 for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
426 {
427 next = current->Rest();
428
429 // First, insert the field node into the sorted list.
430 GenTreeFieldList* prev = nullptr;
431 for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
432 {
433 // If the offset of the current list node is greater than the offset of the cursor or if we have
434 // reached the end of the list, insert the current node before the cursor and terminate.
435 if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
436 {
437 if (prev == nullptr)
438 {
439 assert(cursor == head);
440 head = current;
441 }
442 else
443 {
444 prev->Rest() = current;
445 }
446
447 current->Rest() = cursor;
448 break;
449 }
450 }
451
452 fieldCount++;
453 }
454
455 // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
456 // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
457 // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
458 // corresponding field list nodes in two, giving an upper bound of 8.
459 //
460 // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
461 // the maximum size of a field list grows significantly, we will need to reevaluate it.
462 assert(fieldCount <= 8);
463
464 // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
465 // necessary.
466 if (head != fieldList)
467 {
468 head->gtFlags |= GTF_FIELD_LIST_HEAD;
469 head->SetContained();
470
471 fieldList->ClearContained();
472 fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
473
474#ifdef DEBUG
475 head->gtSeqNum = fieldList->gtSeqNum;
476#endif // DEBUG
477
478 BlockRange().InsertAfter(fieldList, head);
479 BlockRange().Remove(fieldList);
480
481 fieldList = head;
482 putArgStk->gtOp1 = fieldList;
483 putArgStk->gtType = fieldList->gtType;
484 }
485
486        // Now that the fields have been sorted, determine the kind of code we will generate.
487 bool allFieldsAreSlots = true;
488 unsigned prevOffset = putArgStk->getArgSize();
489 for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
490 {
491 GenTree* const fieldNode = current->Current();
492 const var_types fieldType = fieldNode->TypeGet();
493 const unsigned fieldOffset = current->gtFieldOffset;
494 assert(fieldType != TYP_LONG);
495
496 // We can treat as a slot any field that is stored at a slot boundary, where the previous
497 // field is not in the same slot. (Note that we store the fields in reverse order.)
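            // For example, with prevOffset == 12, a field at offset 8 counts as a slot
            // (8 % 4 == 0 and 12 - 8 >= 4), while a field at offset 10 does not.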
498 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
499 if (!fieldIsSlot)
500 {
501 allFieldsAreSlots = false;
502 }
503
504 if (varTypeIsGC(fieldType))
505 {
506 putArgStk->gtNumberReferenceSlots++;
507 }
508
509 // For x86 we must mark all integral fields as contained or reg-optional, and handle them
510 // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
511 // registers to be consumed atomically by the call.
512 if (varTypeIsIntegralOrI(fieldNode))
513 {
514 if (fieldNode->OperGet() == GT_LCL_VAR)
515 {
516 LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
517 if (!varDsc->lvDoNotEnregister)
518 {
519 fieldNode->SetRegOptional();
520 }
521 else
522 {
523 MakeSrcContained(putArgStk, fieldNode);
524 }
525 }
526 else if (fieldNode->IsIntCnsFitsInI32())
527 {
528 MakeSrcContained(putArgStk, fieldNode);
529 }
530 else
531 {
532 // For the case where we cannot directly push the value, if we run out of registers,
533 // it would be better to defer computation until we are pushing the arguments rather
534 // than spilling, but this situation is not all that common, as most cases of promoted
535 // structs do not have a large number of fields, and of those most are lclVars or
536 // copy-propagated constants.
537 fieldNode->SetRegOptional();
538 }
539 }
540
541 prevOffset = fieldOffset;
542 }
543
544 // Set the copy kind.
545 // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
546 // adjust the stack once for those fields. The latter is really best done in code generation, but
547 // this tuning should probably be undertaken as a whole.
548 // Also, if there are floating point fields, it may be better to use the "Unroll" mode
549 // of copying the struct as a whole, if the fields are not register candidates.
550 if (allFieldsAreSlots)
551 {
552 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
553 }
554 else
555 {
556 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
557 }
558 return;
559 }
560#endif // _TARGET_X86_
561
562 GenTree* src = putArgStk->gtOp1;
563
564#ifdef FEATURE_PUT_STRUCT_ARG_STK
565 if (src->TypeGet() != TYP_STRUCT)
566#endif // FEATURE_PUT_STRUCT_ARG_STK
567 {
568 // If the child of GT_PUTARG_STK is a constant, we don't need a register to
569 // move it to memory (stack location).
570 //
571 // On AMD64, we don't want to make 0 contained, because we can generate smaller code
572 // by zeroing a register and then storing it. E.g.:
573 // xor rdx, rdx
574 // mov gword ptr [rsp+28H], rdx
575 // is 2 bytes smaller than:
576 // mov gword ptr [rsp+28H], 0
577 //
578 // On x86, we push stack arguments; we don't use 'mov'. So:
579 // push 0
580 // is 1 byte smaller than:
581 // xor rdx, rdx
582 // push rdx
583
584 if (IsContainableImmed(putArgStk, src)
585#if defined(_TARGET_AMD64_)
586 && !src->IsIntegralConst(0)
587#endif // _TARGET_AMD64_
588 )
589 {
590 MakeSrcContained(putArgStk, src);
591 }
592 return;
593 }
594
595#ifdef FEATURE_PUT_STRUCT_ARG_STK
596 GenTree* dst = putArgStk;
597 GenTree* srcAddr = nullptr;
598
599 bool haveLocalAddr = false;
600 if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
601 {
602 srcAddr = src->gtOp.gtOp1;
603 assert(srcAddr != nullptr);
604 haveLocalAddr = srcAddr->OperIsLocalAddr();
605 }
606 else
607 {
608 assert(varTypeIsSIMD(putArgStk));
609 }
610
611 // In case of a CpBlk we could use a helper call. In case of putarg_stk we
612 // can't do that since the helper call could kill some already set up outgoing args.
613 // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
614 // The cpyXXXX code is rather complex and this could cause it to be more complex, but
615 // it might be the right thing to do.
616
617    // This threshold decides between calling the helper and letting the JIT emit
618    // an inline code sequence of its choice.
619 ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
620 ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
621
622 // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
623 // (I don't know which).
624
625 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
626    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
627 // our framework assemblies, so this is the main code generation scheme we'll use.
628 if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
629 {
630#ifdef _TARGET_X86_
631 if (size < XMM_REGSIZE_BYTES)
632 {
633 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
634 }
635 else
636#endif // _TARGET_X86_
637 {
638 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
639 }
640 }
641#ifdef _TARGET_X86_
642 else if (putArgStk->gtNumberReferenceSlots != 0)
643 {
644 // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
645 // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
646 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
647 }
648#endif // _TARGET_X86_
649 else
650 {
651 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
652 }
653 // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
654 MakeSrcContained(putArgStk, src);
655 if (haveLocalAddr)
656 {
657 // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
658 // copies.
659 //
660 MakeSrcContained(putArgStk, srcAddr);
661 }
662#endif // FEATURE_PUT_STRUCT_ARG_STK
663}
664
665/* Lower GT_CAST(srcType, DstType) nodes.
666 *
667 * Casts from small int type to float/double are transformed as follows:
668 * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double)
669 * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double)
670 * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double)
671 * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double)
672 *
673 * SSE2 conversion instructions operate on signed integers. Casts from Uint32/Uint64
674 * are morphed as follows by the front-end and hence should not be seen here.
675 * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double)
676 * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float)
677 *
678 *
679 * Similarly casts from float/double to a smaller int type are transformed as follows:
680 * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte)
681 * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte)
682 * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(float/double, int32), int16)
683 * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(float/double, int32), uint16)
684 *
685 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
686 * integer. The above transformations help us to leverage those instructions.
687 *
688 * Note that for the following conversions we still depend on helper calls and
689 * don't expect to see them here.
690 * i) GT_CAST(float/double, uint64)
691 * ii) GT_CAST(float/double, int type with overflow detection)
692 *
693 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
694 * There are hardly any occurrences of this conversion operation in platform
695 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript,
696 * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics,
697 * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
698 * doing this optimization is a win, we should consider generating inlined code.
699 */
700void Lowering::LowerCast(GenTree* tree)
701{
702 assert(tree->OperGet() == GT_CAST);
703
704 GenTree* castOp = tree->gtCast.CastOp();
705 var_types castToType = tree->CastToType();
706 var_types srcType = castOp->TypeGet();
707 var_types tmpType = TYP_UNDEF;
708
709 // force the srcType to unsigned if GT_UNSIGNED flag is set
710 if (tree->gtFlags & GTF_UNSIGNED)
711 {
712 srcType = genUnsignedType(srcType);
713 }
714
715 // We should never see the following casts as they are expected to be lowered
716    // appropriately or converted into helper calls by the front-end.
717 // srcType = float/double castToType = * and overflow detecting cast
718 // Reason: must be converted to a helper call
719 // srcType = float/double, castToType = ulong
720 // Reason: must be converted to a helper call
721 // srcType = uint castToType = float/double
722 // Reason: uint -> float/double = uint -> long -> float/double
723 // srcType = ulong castToType = float
724 // Reason: ulong -> float = ulong -> double -> float
725 if (varTypeIsFloating(srcType))
726 {
727 noway_assert(!tree->gtOverflow());
728 noway_assert(castToType != TYP_ULONG);
729 }
730 else if (srcType == TYP_UINT)
731 {
732 noway_assert(!varTypeIsFloating(castToType));
733 }
734 else if (srcType == TYP_ULONG)
735 {
736 noway_assert(castToType != TYP_FLOAT);
737 }
738
739 // Case of src is a small type and dst is a floating point type.
740 if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType))
741 {
742 // These conversions can never be overflow detecting ones.
743 noway_assert(!tree->gtOverflow());
744 tmpType = TYP_INT;
745 }
746 // case of src is a floating point type and dst is a small type.
747 else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType))
748 {
749 tmpType = TYP_INT;
750 }
751
752 if (tmpType != TYP_UNDEF)
753 {
754 GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType);
755 tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
756
757 tree->gtFlags &= ~GTF_UNSIGNED;
758 tree->gtOp.gtOp1 = tmp;
759 BlockRange().InsertAfter(castOp, tmp);
760 ContainCheckCast(tmp->AsCast());
761 }
762
763 // Now determine if we have operands that should be contained.
764 ContainCheckCast(tree->AsCast());
765}
766
767#ifdef FEATURE_SIMD
768//----------------------------------------------------------------------------------------------
769// Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node.
770//
771// Arguments:
772// simdNode - The SIMD intrinsic node.
773//
774void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
775{
776 if (simdNode->TypeGet() == TYP_SIMD12)
777 {
778        // A GT_SIMD node that is required to produce a TYP_SIMD12 value
779        // in fact produces a TYP_SIMD16 result
780 simdNode->gtType = TYP_SIMD16;
781 }
782
783#ifdef _TARGET_XARCH_
784 if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND))
785 {
786 // If SIMD vector is already in memory, we force its
787 // addr to be evaluated into a reg. This would allow
788 // us to generate [regBase] or [regBase+offset] or
789 // [regBase+sizeOf(SIMD vector baseType)*regIndex]
790 // to access the required SIMD vector element directly
791 // from memory.
792 //
793 // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
794        // might be able to update GT_LEA to fold the regIndex
795 // or offset in some cases. Instead with this
796 // approach we always evaluate GT_LEA into a reg.
797 // Ideally, we should be able to lower GetItem intrinsic
798 // into GT_IND(newAddr) where newAddr combines
799 // the addr of SIMD vector with the given index.
800 simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
801 }
802 else if (simdNode->IsSIMDEqualityOrInequality())
803 {
804 LIR::Use simdUse;
805
806 if (BlockRange().TryGetUse(simdNode, &simdUse))
807 {
808 //
809 // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into
810 // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y)
811 // is expected to set the Zero flag appropriately.
812            // All the involved nodes must form a contiguous range; there's no other way to
813 // guarantee that condition flags aren't changed between the SIMD node and the JCC
814 // node.
815 //
816
817 bool transformed = false;
818 GenTree* simdUser = simdUse.User();
819
820 if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() &&
821 (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser))
822 {
823 ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue();
824
825 if ((relopOp2Value == 0) || (relopOp2Value == 1))
826 {
827 GenTree* jtrue = simdUser->gtNext;
828
829 if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser))
830 {
831 if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ))
832 {
833 relopOp2Value ^= 1;
834 }
835
836 jtrue->ChangeOper(GT_JCC);
837 GenTreeCC* jcc = jtrue->AsCC();
838 jcc->gtFlags |= GTF_USE_FLAGS;
839 jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ;
840
841 BlockRange().Remove(simdUser->gtGetOp2());
842 BlockRange().Remove(simdUser);
843 transformed = true;
844 }
845 }
846 }
847
848 if (!transformed)
849 {
850 //
851                // The code generated for SIMD<OpEquality|OpInEquality>(x, y) nodes sets
852 // the Zero flag like integer compares do so we can simply use SETCC<EQ|NE>
853 // to produce the desired result. This avoids the need for subsequent phases
854 // to have to handle 2 cases (set flags/set destination register).
855 //
856
857 genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GT_EQ : GT_NE;
858 GenTreeCC* setcc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet());
859 setcc->gtFlags |= GTF_USE_FLAGS;
860 BlockRange().InsertAfter(simdNode, setcc);
861 simdUse.ReplaceWith(comp, setcc);
862 }
863 }
864
865 simdNode->gtFlags |= GTF_SET_FLAGS;
866 simdNode->gtType = TYP_VOID;
867 }
868#endif
869 ContainCheckSIMD(simdNode);
870}
871#endif // FEATURE_SIMD
872
873#ifdef FEATURE_HW_INTRINSICS
874//----------------------------------------------------------------------------------------------
875// Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
876//
877// Arguments:
878// node - The hardware intrinsic node.
879//
880void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
881{
882 ContainCheckHWIntrinsic(node);
883}
884#endif // FEATURE_HW_INTRINSICS
885
886//----------------------------------------------------------------------------------------------
887// Lowering::IsRMWIndirCandidate:
888// Returns true if the given operand is a candidate indirection for a read-modify-write
889// operator.
890//
891// Arguments:
892// operand - The operand to consider.
893// storeInd - The indirect store that roots the possible RMW operator.
894//
895bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd)
896{
897 // If the operand isn't an indirection, it's trivially not a candidate.
898 if (operand->OperGet() != GT_IND)
899 {
900 return false;
901 }
902
903 // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the
904 // indirection is not a candidate.
905 GenTree* srcAddr = operand->gtGetOp1();
906 GenTree* dstAddr = storeInd->gtGetOp1();
907 if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd))
908 {
909 return false;
910 }
911
912 // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a
913 // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the
914 // indirection's tree is visited and check the side effects at each point.
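    // This is essentially a mark-and-sweep style walk: the indirection is marked, each marked
    // node that is reached has its operands marked in turn, and the walk ends once every node
    // in the indirection's dataflow tree has been visited and unmarked.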
915
916 m_scratchSideEffects.Clear();
917
918 assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0);
919 operand->gtLIRFlags |= LIR::Flags::Mark;
920
921 unsigned markCount = 1;
922 GenTree* node;
923 for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev)
924 {
925 assert(node != nullptr);
926
927 if ((node->gtLIRFlags & LIR::Flags::Mark) == 0)
928 {
929 m_scratchSideEffects.AddNode(comp, node);
930 }
931 else
932 {
933 node->gtLIRFlags &= ~LIR::Flags::Mark;
934 markCount--;
935
936 if (m_scratchSideEffects.InterferesWith(comp, node, false))
937 {
938                // The indirection's tree contains some node that can't be moved to the storeIndir. The indirection is
939 // not a candidate. Clear any leftover mark bits and return.
940 for (; markCount > 0; node = node->gtPrev)
941 {
942 if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
943 {
944 node->gtLIRFlags &= ~LIR::Flags::Mark;
945 markCount--;
946 }
947 }
948 return false;
949 }
950
951 node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult {
952 assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0);
953 nodeOperand->gtLIRFlags |= LIR::Flags::Mark;
954 markCount++;
955 return GenTree::VisitResult::Continue;
956 });
957 }
958 }
959
960 // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's
961 // destination address, and that it and the transitive closure of its operand can be safely contained by the
962 // storeIndir. This indirection is therefore a candidate for an RMW op.
963 return true;
964}
965
966//----------------------------------------------------------------------------------------------
967// Returns true if this tree is the binOp of a GT_STOREIND of the following form
968// storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
969//      storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA))) in case of commutative bin-ops
970//
971// The above form for storeInd represents a read-modify-write memory binary operation.
972//
973// Parameters
974//    tree - GenTreePtr of the binOp
975//
976// Return Value
977// True if 'tree' is part of a RMW memory operation pattern
978//
979bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree)
980{
981 // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops
982 assert(!varTypeIsFloating(tree));
983 assert(GenTree::OperIsBinary(tree->OperGet()));
984
985 // Cheap bail out check before more expensive checks are performed.
986    // The RMW memory op pattern requires one of the operands of the binOp to be a GT_IND.
987 if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND)
988 {
989 return false;
990 }
991
992 LIR::Use use;
993 if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree)
994 {
995 return false;
996 }
997
998    // Since it is relatively expensive to recognize the RMW memory op pattern, we
999    // cache the result in the GT_STOREIND node so that we can use it while
1000    // lowering the GT_STOREIND.
1001 GenTree* indirCandidate = nullptr;
1002 GenTree* indirOpSource = nullptr;
1003 return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource);
1004}
1005
1006//----------------------------------------------------------------------------------------------
1007// This method recognizes the case where we have a treeNode with the following structure:
1008// storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
1009//         storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst))) in case of commutative operations OR
1010//         storeInd(IndirDst, unaryOp(gtInd(IndirDst))) in case of unary operations
1011//
1012// Terminology:
1013// indirDst = memory write of an addr mode (i.e. storeind destination)
1014// indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
1015// indirCandidate = memory read i.e. a gtInd of an addr mode
1016// indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
1017//
1018// In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
1019// following form in case of integer operations:
1020// binOp [addressing mode], RegIndirOpSource
1021// binOp [addressing mode], immediateVal
1022// where RegIndirOpSource is the register where indirOpSource was computed.
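//    For example, a statement like "*p |= mask" can be encoded as a single
//    "or dword ptr [rax], edx" (or "or dword ptr [rax], 0x10" when the source is an
//    immediate) rather than a separate load, or, and store.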
1023//
1024// Right now, we recognize a few cases:
1025// a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
1026// b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
1027// c) unaryOp is either not/neg
1028//
1029// Implementation Note: The following routines need to be in sync for RMW memory op optimization
1030// to be correct and functional.
1031// IndirsAreEquivalent()
1032// NodesAreEquivalentLeaves()
1033// Codegen of GT_STOREIND and genCodeForShiftRMW()
1034// emitInsRMW()
1035//
1036// TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
1037// package to perform more complex tree recognition.
1038//
1039// TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
1040//
1041// Parameters:
1042// tree - GT_STOREIND node
1043// outIndirCandidate - out param set to indirCandidate as described above
1044//     outIndirOpSource - out param set to indirOpSource as described above
1045//
1046// Return value
1047// True if there is a RMW memory operation rooted at a GT_STOREIND tree
1048// and out params indirCandidate and indirOpSource are set to non-null values.
1049// Otherwise, returns false with indirCandidate and indirOpSource set to null.
1050// Also updates flags of GT_STOREIND tree with its RMW status.
1051//
1052bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource)
1053{
1054 assert(!varTypeIsFloating(tree));
1055 assert(outIndirCandidate != nullptr);
1056 assert(outIndirOpSource != nullptr);
1057
1058 *outIndirCandidate = nullptr;
1059 *outIndirOpSource = nullptr;
1060
1061 // Early out if storeInd is already known to be a non-RMW memory op
1062 GenTreeStoreInd* storeInd = tree->AsStoreInd();
1063 if (storeInd->IsNonRMWMemoryOp())
1064 {
1065 return false;
1066 }
1067
1068 GenTree* indirDst = storeInd->gtGetOp1();
1069 GenTree* indirSrc = storeInd->gtGetOp2();
1070 genTreeOps oper = indirSrc->OperGet();
1071
1072 // Early out if it is already known to be a RMW memory op
1073 if (storeInd->IsRMWMemoryOp())
1074 {
1075 if (GenTree::OperIsBinary(oper))
1076 {
1077 if (storeInd->IsRMWDstOp1())
1078 {
1079 *outIndirCandidate = indirSrc->gtGetOp1();
1080 *outIndirOpSource = indirSrc->gtGetOp2();
1081 }
1082 else
1083 {
1084 assert(storeInd->IsRMWDstOp2());
1085 *outIndirCandidate = indirSrc->gtGetOp2();
1086 *outIndirOpSource = indirSrc->gtGetOp1();
1087 }
1088 assert(IndirsAreEquivalent(*outIndirCandidate, storeInd));
1089 }
1090 else
1091 {
1092 assert(GenTree::OperIsUnary(oper));
1093 assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd));
1094 *outIndirCandidate = indirSrc->gtGetOp1();
1095 *outIndirOpSource = indirSrc->gtGetOp1();
1096 }
1097
1098 return true;
1099 }
1100
1101    // Reaching here means that we do not know the RMW status of the tree rooted at storeInd
1102 assert(storeInd->IsRMWStatusUnknown());
1103
1104 // Early out if indirDst is not one of the supported memory operands.
1105 if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR &&
1106 indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT)
1107 {
1108 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1109 return false;
1110 }
1111
1112    // We cannot use Read-Modify-Write instruction forms with overflow checking instructions
1113 // because we are not allowed to modify the target until after the overflow check.
1114 if (indirSrc->gtOverflowEx())
1115 {
1116 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1117 return false;
1118 }
1119
1120 // At this point we can match one of two patterns:
1121 //
1122 // t_ind = indir t_addr_0
1123 // ...
1124 // t_value = binop t_ind, t_other
1125 // ...
1126 // storeIndir t_addr_1, t_value
1127 //
1128 // or
1129 //
1130 // t_ind = indir t_addr_0
1131 // ...
1132 // t_value = unop t_ind
1133 // ...
1134 // storeIndir t_addr_1, t_value
1135 //
1136 // In all cases, we will eventually make the binop that produces t_value and the entire dataflow tree rooted at
1137 // t_ind contained by t_value.
1138
1139 GenTree* indirCandidate = nullptr;
1140 GenTree* indirOpSource = nullptr;
1141 RMWStatus status = STOREIND_RMW_STATUS_UNKNOWN;
1142 if (GenTree::OperIsBinary(oper))
1143 {
1144 // Return if binary op is not one of the supported operations for RMW of memory.
1145 if (!GenTree::OperIsRMWMemOp(oper))
1146 {
1147 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1148 return false;
1149 }
1150
1151 if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd))
1152 {
1153        // For ldind, integer values smaller than 4 bytes (a boolean or a character) are converted to
1154        // 4 bytes by sign- or zero-extension as appropriate. If we directly shift the small-typed data
1155        // using sar, we will lose the sign- or zero-extension bits.
1156 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE);
1157 return false;
1158 }
1159
1160 // In the common case, the second operand to the binop will be the indir candidate.
1161 GenTreeOp* binOp = indirSrc->AsOp();
1162 if (GenTree::OperIsCommutative(oper) && IsRMWIndirCandidate(binOp->gtOp2, storeInd))
1163 {
1164 indirCandidate = binOp->gtOp2;
1165 indirOpSource = binOp->gtOp1;
1166 status = STOREIND_RMW_DST_IS_OP2;
1167 }
1168 else if (IsRMWIndirCandidate(binOp->gtOp1, storeInd))
1169 {
1170 indirCandidate = binOp->gtOp1;
1171 indirOpSource = binOp->gtOp2;
1172 status = STOREIND_RMW_DST_IS_OP1;
1173 }
1174 else
1175 {
1176 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1177 return false;
1178 }
1179 }
1180 else if (GenTree::OperIsUnary(oper))
1181 {
1182 // Nodes other than GT_NOT and GT_NEG are not yet supported.
1183 if (oper != GT_NOT && oper != GT_NEG)
1184 {
1185 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1186 return false;
1187 }
1188
1189 if (indirSrc->gtGetOp1()->OperGet() != GT_IND)
1190 {
1191 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1192 return false;
1193 }
1194
1195 GenTreeUnOp* unOp = indirSrc->AsUnOp();
1196 if (IsRMWIndirCandidate(unOp->gtOp1, storeInd))
1197 {
1198 // src and dest are the same in case of unary ops
1199 indirCandidate = unOp->gtOp1;
1200 indirOpSource = unOp->gtOp1;
1201 status = STOREIND_RMW_DST_IS_OP1;
1202 }
1203 else
1204 {
1205 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1206 return false;
1207 }
1208 }
1209 else
1210 {
1211 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1212 return false;
1213 }
1214
1215 // By this point we've verified that we have a supported operand with a supported address. Now we need to ensure
1216 // that we're able to move the destination address for the source indirection forwards.
1217 if (!IsSafeToContainMem(storeInd, indirDst))
1218 {
1219 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1220 return false;
1221 }
1222
1223 assert(indirCandidate != nullptr);
1224 assert(indirOpSource != nullptr);
1225 assert(status != STOREIND_RMW_STATUS_UNKNOWN);
1226
1227 *outIndirCandidate = indirCandidate;
1228 *outIndirOpSource = indirOpSource;
1229 storeInd->SetRMWStatus(status);
1230 return true;
1231}
1232
1233// anything is in range for AMD64
1234bool Lowering::IsCallTargetInRange(void* addr)
1235{
1236 return true;
1237}
1238
1239// return true if the immediate can be folded into an instruction, for example small enough and non-relocatable
1240bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
1241{
1242 if (!childNode->IsIntCnsFitsInI32())
1243 {
1244 return false;
1245 }
1246
1247    // At this point we know that it is an int const that fits within 4 bytes and hence can safely be cast to IntConCommon.
1248 // Icons that need relocation should never be marked as contained immed
1249 if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp))
1250 {
1251 return false;
1252 }
1253
1254 return true;
1255}
1256
1257//-----------------------------------------------------------------------
1258// PreferredRegOptionalOperand: returns one of the operands of given
1259// binary oper that is to be preferred for marking as reg optional.
1260//
1261// Since only one of op1 or op2 can be a memory operand on xarch, only
1262// one of them has to be marked as reg optional. Since Lower doesn't
1263// know a priori which of op1 or op2 is not likely to get a register, it
1264// has to make a guess. This routine encapsulates heuristics that
1265// guess whether it is likely to be beneficial to mark op1 or op2 as
1266// reg optional.
1267//
1268//
1269// Arguments:
1270// tree - a binary-op tree node that is either commutative
1271// or a compare oper.
1272//
1273// Returns:
1274// Returns op1 or op2 of tree node that is preferred for
1275// marking as reg optional.
1276//
1277// Note: if the tree oper is neither commutative nor a compare oper
1278// then only op2 can be reg optional on xarch and hence no need to
1279// call this routine.
1280GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
1281{
1282 assert(GenTree::OperIsBinary(tree->OperGet()));
1283 assert(tree->OperIsCommutative() || tree->OperIsCompare() || tree->OperIs(GT_CMP));
1284
1285 GenTree* op1 = tree->gtGetOp1();
1286 GenTree* op2 = tree->gtGetOp2();
1287 assert(!op1->IsRegOptional() && !op2->IsRegOptional());
1288
1289 // We default to op1, as op2 is likely to have the shorter lifetime.
1290 GenTree* preferredOp = op1;
1291
1292 // This routine uses the following heuristics:
1293 //
1294 // a) If both are register candidates, marking the one with lower weighted
1295 // ref count as reg-optional would likely be beneficial as it has
1296 // higher probability of not getting a register. Note that we use !lvDoNotEnregister
1297 // here because this is being done while we are adding lclVars for Lowering.
1298 //
1299 // b) op1 = tracked local and op2 = untracked local: LSRA creates two
1300 // ref positions for op2: a def and use position. op2's def position
1301 // requires a reg and it is allocated a reg by spilling another
1302 // interval (if required) and that could be even op1. For this reason
1303 // it is beneficial to mark op1 as reg optional.
1304 //
1305 // TODO: It is not always mandatory for a def position of an untracked
1306 // local to be allocated a register if it is on rhs of an assignment
1307 // and its use position is reg-optional and has not been assigned a
1308 // register. Reg optional def positions is currently not yet supported.
1309 //
1310 // c) op1 = untracked local and op2 = tracked local: marking op1 as
1311 // reg optional is beneficial, since its use position is less likely
1312 // to get a register.
1313 //
1314 // d) If both are untracked locals (i.e. treated like tree temps by
1315 // LSRA): though either of them could be marked as reg optional,
1316 // marking op1 as reg optional is likely to be beneficial because
1317 // while allocating op2's def position, there is a possibility of
1318 // spilling op1's def and in which case op1 is treated as contained
1319 // memory operand rather than requiring to reload.
1320 //
1321 // e) If only one of them is a local var, prefer to mark it as
1322    //     reg-optional. This heuristic is based on the results
1323 // obtained against CQ perf benchmarks.
1324 //
1325 // f) If neither of them are local vars (i.e. tree temps), prefer to
1326 // mark op1 as reg optional for the same reason as mentioned in (d) above.
1327 if (op1->OperGet() == GT_LCL_VAR && op2->OperGet() == GT_LCL_VAR)
1328 {
1329 LclVarDsc* v1 = comp->lvaTable + op1->AsLclVarCommon()->GetLclNum();
1330 LclVarDsc* v2 = comp->lvaTable + op2->AsLclVarCommon()->GetLclNum();
1331
1332 bool v1IsRegCandidate = !v1->lvDoNotEnregister;
1333 bool v2IsRegCandidate = !v2->lvDoNotEnregister;
1334 if (v1IsRegCandidate && v2IsRegCandidate)
1335 {
1336 // Both are enregisterable locals. The one with lower weight is less likely
1337 // to get a register and hence beneficial to mark the one with lower
1338 // weight as reg optional.
1339 // If either is not tracked, it may be that it was introduced after liveness
1340 // was run, in which case we will always prefer op1 (should we use raw refcnt??).
1341 if (v1->lvTracked && v2->lvTracked && (v1->lvRefCntWtd() >= v2->lvRefCntWtd()))
1342 {
1343 preferredOp = op2;
1344 }
1345 }
1346 }
1347 else if (!(op1->OperGet() == GT_LCL_VAR) && (op2->OperGet() == GT_LCL_VAR))
1348 {
1349 preferredOp = op2;
1350 }
1351
1352 return preferredOp;
1353}
1354
1355//------------------------------------------------------------------------
1356// Containment analysis
1357//------------------------------------------------------------------------
1358
1359//------------------------------------------------------------------------
1360// ContainCheckCallOperands: Determine whether operands of a call should be contained.
1361//
1362// Arguments:
1363// call - The call node of interest
1364//
1365// Return Value:
1366// None.
1367//
1368void Lowering::ContainCheckCallOperands(GenTreeCall* call)
1369{
1370 GenTree* ctrlExpr = call->gtControlExpr;
1371 if (call->gtCallType == CT_INDIRECT)
1372 {
1373 // either gtControlExpr != null or gtCallAddr != null.
1374 // Both cannot be non-null at the same time.
1375 assert(ctrlExpr == nullptr);
1376 assert(call->gtCallAddr != nullptr);
1377 ctrlExpr = call->gtCallAddr;
1378
1379#ifdef _TARGET_X86_
1380 // Fast tail calls aren't currently supported on x86, but if they ever are, the code
1381 // below that handles indirect VSD calls will need to be fixed.
1382 assert(!call->IsFastTailCall() || !call->IsVirtualStub());
1383#endif // _TARGET_X86_
1384 }
1385
1386 // set reg requirements on call target represented as control sequence.
1387 if (ctrlExpr != nullptr)
1388 {
1389 // we should never see a gtControlExpr whose type is void.
1390 assert(ctrlExpr->TypeGet() != TYP_VOID);
1391
1392        // In case of a fast tail call implemented as a jmp, make sure that gtControlExpr is
1393 // computed into a register.
1394 if (!call->IsFastTailCall())
1395 {
1396#ifdef _TARGET_X86_
1397 // On x86, we need to generate a very specific pattern for indirect VSD calls:
1398 //
1399 // 3-byte nop
1400 // call dword ptr [eax]
1401 //
1402 // Where EAX is also used as an argument to the stub dispatch helper. Make
1403 // sure that the call target address is computed into EAX in this case.
1404 if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1405 {
1406 assert(ctrlExpr->isIndir());
1407 MakeSrcContained(call, ctrlExpr);
1408 }
1409 else
1410#endif // _TARGET_X86_
1411 if (ctrlExpr->isIndir())
1412 {
1413                // We may have cases where we have set a register target on the ctrlExpr, but if it is
1414 // contained we must clear it.
1415 ctrlExpr->gtRegNum = REG_NA;
1416 MakeSrcContained(call, ctrlExpr);
1417 }
1418 }
1419 }
1420
1421 GenTree* args = call->gtCallArgs;
1422 while (args)
1423 {
1424 GenTree* arg = args->gtOp.gtOp1;
1425 if (arg->gtOper == GT_PUTARG_STK)
1426 {
1427 LowerPutArgStk(arg->AsPutArgStk());
1428 }
1429 args = args->gtOp.gtOp2;
1430 }
1431 args = call->gtCallLateArgs;
1432 while (args)
1433 {
1434 GenTree* arg = args->gtOp.gtOp1;
1435 if (arg->gtOper == GT_PUTARG_STK)
1436 {
1437 LowerPutArgStk(arg->AsPutArgStk());
1438 }
1439 args = args->gtOp.gtOp2;
1440 }
1441}
1442
1443//------------------------------------------------------------------------
1444// ContainCheckIndir: Determine whether operands of an indir should be contained.
1445//
1446// Arguments:
1447// node - The indirection node of interest
1448//
1449// Notes:
1450// This is called for both store and load indirections. In the former case, it is assumed that
1451// LowerStoreIndir() has already been called to check for RMW opportunities.
1452//
1453// Return Value:
1454// None.
1455//
1456void Lowering::ContainCheckIndir(GenTreeIndir* node)
1457{
1458 GenTree* addr = node->Addr();
1459
1460 // If this is the rhs of a block copy it will be handled when we handle the store.
1461 if (node->TypeGet() == TYP_STRUCT)
1462 {
1463 return;
1464 }
1465
1466#ifdef FEATURE_SIMD
1467 // If indirTree is of TYP_SIMD12, don't mark addr as contained
1468    // so that it always gets computed into a register. This would
1469 // mean codegen side logic doesn't need to handle all possible
1470 // addr expressions that could be contained.
1471 //
1472 // TODO-XArch-CQ: handle other addr mode expressions that could be marked
1473 // as contained.
1474 if (node->TypeGet() == TYP_SIMD12)
1475 {
1476 return;
1477 }
1478#endif // FEATURE_SIMD
1479
1480 if ((node->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
1481 {
1482        // This is an indirection that requires its address to be in a register.
1483        // Skip any further processing that might otherwise make the address contained.
1484 }
1485 else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
1486 {
1487 // These nodes go into an addr mode:
1488 // - GT_CLS_VAR_ADDR turns into a constant.
1489 // - GT_LCL_VAR_ADDR is a stack addr mode.
1490
1491 // make this contained, it turns into a constant that goes into an addr mode
1492 MakeSrcContained(node, addr);
1493 }
1494 else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
1495 {
1496 // Amd64:
1497 // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
1498 // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case,
1499 // VM requires us to pass stub addr in VirtualStubParam.reg - see LowerVirtualStubCall(). For
1500 // that reason we cannot mark such an addr as contained. Note that this is not an issue for
1501 // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
1502 // argument.
1503 //
1504 // Workaround:
1505 // Note that LowerVirtualStubCall() sets addr->gtRegNum to VirtualStubParam.reg and Lowering::doPhase()
1506 // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA.
1507 // Ideally we should set a flag on addr nodes that shouldn't be marked as contained
1508 // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
1509 // an explicit check is made here.
1510 //
1511 // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
1512 MakeSrcContained(node, addr);
1513 }
1514 else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(node, addr))
1515 {
1516 MakeSrcContained(node, addr);
1517 }
1518}
1519
1520//------------------------------------------------------------------------
1521// ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained.
1522//
1523// Arguments:
1524// node - pointer to the node
1525//
1526void Lowering::ContainCheckStoreIndir(GenTreeIndir* node)
1527{
1528 // If the source is a containable immediate, make it contained, unless it is
1529 // an int-size or larger store of zero to memory, because we can generate smaller code
1530 // by zeroing a register and then storing it.
1531 GenTree* src = node->gtOp.gtOp2;
1532 if (IsContainableImmed(node, src) &&
1533 (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
1534 {
1535 MakeSrcContained(node, src);
1536 }
1537 ContainCheckIndir(node);
1538}
1539
1540//------------------------------------------------------------------------
1541// ContainCheckMul: determine whether the sources of a MUL node should be contained.
1542//
1543// Arguments:
1544// node - pointer to the node
1545//
1546void Lowering::ContainCheckMul(GenTreeOp* node)
1547{
1548#if defined(_TARGET_X86_)
1549 assert(node->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
1550#else
1551 assert(node->OperIs(GT_MUL, GT_MULHI));
1552#endif
1553
1554 // Case of float/double mul.
1555 if (varTypeIsFloating(node->TypeGet()))
1556 {
1557 ContainCheckFloatBinary(node);
1558 return;
1559 }
1560
1561 GenTree* op1 = node->gtOp.gtOp1;
1562 GenTree* op2 = node->gtOp.gtOp2;
1563
1564 bool isSafeToContainOp1 = true;
1565 bool isSafeToContainOp2 = true;
1566
1567 bool isUnsignedMultiply = ((node->gtFlags & GTF_UNSIGNED) != 0);
1568 bool requiresOverflowCheck = node->gtOverflowEx();
1569 bool useLeaEncoding = false;
1570 GenTree* memOp = nullptr;
1571
1572 bool hasImpliedFirstOperand = false;
1573 GenTreeIntConCommon* imm = nullptr;
1574 GenTree* other = nullptr;
1575
1576 // Multiply should never be using small types
1577 assert(!varTypeIsSmall(node->TypeGet()));
1578
1579 // We do use the widening multiply to implement
1580 // the overflow checking for unsigned multiply
1581 //
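    // (Illustrative: on x64 this is the one-operand form "mul r/m64", which computes
    // RDX:RAX = RAX * r/m64; op1 is therefore implied in RAX and overflow is detected
    // from the upper half of the result via the carry/overflow flags.)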
1582 if (isUnsignedMultiply && requiresOverflowCheck)
1583 {
1584 hasImpliedFirstOperand = true;
1585 }
1586 else if (node->OperGet() == GT_MULHI)
1587 {
1588 hasImpliedFirstOperand = true;
1589 }
1590#if defined(_TARGET_X86_)
1591 else if (node->OperGet() == GT_MUL_LONG)
1592 {
1593 hasImpliedFirstOperand = true;
1594 }
1595#endif
1596 else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1))
1597 {
1598 if (IsContainableImmed(node, op2))
1599 {
1600 imm = op2->AsIntConCommon();
1601 other = op1;
1602 }
1603 else
1604 {
1605 imm = op1->AsIntConCommon();
1606 other = op2;
1607 }
1608
1609 // CQ: We want to rewrite this into a LEA
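        // (e.g., illustratively, "x * 5" can be emitted as "lea dst, [src + src*4]";
        // multiplies by 3 and 9 use scale factors 2 and 8 respectively.)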
1610 ssize_t immVal = imm->AsIntConCommon()->IconValue();
1611 if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
1612 {
1613 useLeaEncoding = true;
1614 }
1615
1616 MakeSrcContained(node, imm); // The imm is always contained
1617 if (IsContainableMemoryOp(other))
1618 {
1619 memOp = other; // memOp may be contained below
1620 }
1621 }
1622
    // We allow one operand to be a contained memory operand.
    // The memory op type must match the 'node' type.
    // This is because during codegen we use the 'node' type to derive EmitTypeSize.
    // E.g. op1 type = byte, op2 type = byte but GT_MUL node type is int.
    //
1628 if (memOp == nullptr)
1629 {
1630 if ((op2->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op2))
1631 {
1632 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
1633 if (isSafeToContainOp2)
1634 {
1635 memOp = op2;
1636 }
1637 }
1638
1639 if ((memOp == nullptr) && (op1->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op1))
1640 {
1641 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
1642 if (isSafeToContainOp1)
1643 {
1644 memOp = op1;
1645 }
1646 }
1647 }
1648 else
1649 {
1650 if ((memOp->TypeGet() != node->TypeGet()))
1651 {
1652 memOp = nullptr;
1653 }
1654 else if (!IsSafeToContainMem(node, memOp))
1655 {
1656 if (memOp == op1)
1657 {
1658 isSafeToContainOp1 = false;
1659 }
1660 else
1661 {
1662 isSafeToContainOp2 = false;
1663 }
1664 memOp = nullptr;
1665 }
1666 }
    // To generate an LEA we need to force memOp into a register,
    // so we don't allow memOp to be 'contained'.
    //
1670 if (!useLeaEncoding)
1671 {
1672 if (memOp != nullptr)
1673 {
1674 MakeSrcContained(node, memOp);
1675 }
1676 else
1677 {
1678 // IsSafeToContainMem is expensive so we call it at most once for each operand
1679 // in this method. If we already called IsSafeToContainMem, it must have returned false;
1680 // otherwise, memOp would be set to the corresponding operand (op1 or op2).
1681 if (imm != nullptr)
1682 {
1683 // Has a contained immediate operand.
1684 // Only 'other' operand can be marked as reg optional.
1685 assert(other != nullptr);
1686
1687 isSafeToContainOp1 = ((other == op1) && isSafeToContainOp1 && IsSafeToContainMem(node, op1));
1688 isSafeToContainOp2 = ((other == op2) && isSafeToContainOp2 && IsSafeToContainMem(node, op2));
1689 }
1690 else if (hasImpliedFirstOperand)
1691 {
1692 // Only op2 can be marked as reg optional.
1693 isSafeToContainOp1 = false;
1694 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
1695 }
1696 else
1697 {
                // If there are no containable operands, we can make either op1 or op2
                // reg optional.
1700 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
1701 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
1702 }
1703 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
1704 }
1705 }
1706}
1707
1708//------------------------------------------------------------------------
1709// ContainCheckDivOrMod: determine which operands of a div/mod should be contained.
1710//
1711// Arguments:
1712// node - pointer to the node
1713//
1714void Lowering::ContainCheckDivOrMod(GenTreeOp* node)
1715{
1716 assert(node->OperIs(GT_DIV, GT_MOD, GT_UDIV, GT_UMOD));
1717
1718 if (varTypeIsFloating(node->TypeGet()))
1719 {
1720 ContainCheckFloatBinary(node);
1721 return;
1722 }
1723
1724 GenTree* dividend = node->gtGetOp1();
1725 GenTree* divisor = node->gtGetOp2();
1726
1727 bool divisorCanBeRegOptional = true;
1728#ifdef _TARGET_X86_
1729 if (dividend->OperGet() == GT_LONG)
1730 {
1731 divisorCanBeRegOptional = false;
1732 MakeSrcContained(node, dividend);
1733 }
1734#endif
1735
1736 // divisor can be an r/m, but the memory indirection must be of the same size as the divide
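    // (Illustrative: a 32-bit signed divide is "idiv r/m32", which divides EDX:EAX by the
    // operand, so the divisor can be read directly from memory, e.g. "idiv dword ptr [rbp-8]".)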
1737 if (IsContainableMemoryOp(divisor) && (divisor->TypeGet() == node->TypeGet()))
1738 {
1739 MakeSrcContained(node, divisor);
1740 }
1741 else if (divisorCanBeRegOptional)
1742 {
1743 // If there are no containable operands, we can make an operand reg optional.
1744 // Div instruction allows only divisor to be a memory op.
1745 divisor->SetRegOptional();
1746 }
1747}
1748
1749//------------------------------------------------------------------------
1750// ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained.
1751//
1752// Arguments:
1753// node - pointer to the node
1754//
1755void Lowering::ContainCheckShiftRotate(GenTreeOp* node)
1756{
1757 assert(node->OperIsShiftOrRotate());
1758#ifdef _TARGET_X86_
1759 GenTree* source = node->gtOp1;
1760 if (node->OperIsShiftLong())
1761 {
1762 assert(source->OperGet() == GT_LONG);
1763 MakeSrcContained(node, source);
1764 }
#endif // _TARGET_X86_
1766
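    // A constant shift/rotate count can be encoded as an imm8 in the instruction itself
    // (illustratively, "shl eax, 5"), so a constant count in [0..255] doesn't need to be
    // loaded into CL first.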
1767 GenTree* shiftBy = node->gtOp2;
1768 if (IsContainableImmed(node, shiftBy) && (shiftBy->gtIntConCommon.IconValue() <= 255) &&
1769 (shiftBy->gtIntConCommon.IconValue() >= 0))
1770 {
1771 MakeSrcContained(node, shiftBy);
1772 }
1773}
1774
1775//------------------------------------------------------------------------
1776// ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained.
1777//
1778// Arguments:
1779// node - pointer to the node
1780//
1781void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc)
1782{
1783 assert(storeLoc->OperIsLocalStore());
1784 GenTree* op1 = storeLoc->gtGetOp1();
1785
1786#ifdef FEATURE_SIMD
1787 if (varTypeIsSIMD(storeLoc))
1788 {
1789 if (op1->IsCnsIntOrI())
1790 {
1791 // For an InitBlk we want op1 to be contained; otherwise we want it to
1792 // be evaluated into an xmm register.
1793 MakeSrcContained(storeLoc, op1);
1794 }
1795 return;
1796 }
1797#endif // FEATURE_SIMD
1798
1799 // If the source is a containable immediate, make it contained, unless it is
1800 // an int-size or larger store of zero to memory, because we can generate smaller code
1801 // by zeroing a register and then storing it.
1802 if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
1803 {
1804 MakeSrcContained(storeLoc, op1);
1805 }
1806#ifdef _TARGET_X86_
1807 else if (op1->OperGet() == GT_LONG)
1808 {
1809 MakeSrcContained(storeLoc, op1);
1810 }
1811#endif // _TARGET_X86_
1812}
1813
1814//------------------------------------------------------------------------
1815// ContainCheckCast: determine whether the source of a CAST node should be contained.
1816//
1817// Arguments:
1818// node - pointer to the node
1819//
1820void Lowering::ContainCheckCast(GenTreeCast* node)
1821{
1822 GenTree* castOp = node->CastOp();
1823 var_types castToType = node->CastToType();
1824 var_types srcType = castOp->TypeGet();
1825
1826 // force the srcType to unsigned if GT_UNSIGNED flag is set
1827 if (node->gtFlags & GTF_UNSIGNED)
1828 {
1829 srcType = genUnsignedType(srcType);
1830 }
1831
1832 if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType)))
1833 {
1834#ifdef DEBUG
        // If converting to float/double, the operand must be 4 or 8 bytes in size.
1836 if (varTypeIsFloating(castToType))
1837 {
1838 unsigned opSize = genTypeSize(srcType);
1839 assert(opSize == 4 || opSize == 8);
1840 }
1841#endif // DEBUG
1842
1843 // U8 -> R8 conversion requires that the operand be in a register.
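        // (Illustrative: an int-to-double conversion can fold its source, e.g.
        // "cvtsi2sd xmm0, dword ptr [mem]", whereas the TYP_ULONG case is emitted as a
        // multi-instruction fix-up sequence and so needs its operand in a register.)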
1844 if (srcType != TYP_ULONG)
1845 {
1846 if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl())
1847 {
1848 MakeSrcContained(node, castOp);
1849 }
1850 else
1851 {
1852 // Mark castOp as reg optional to indicate codegen
1853 // can still generate code if it is on stack.
1854 castOp->SetRegOptional();
1855 }
1856 }
1857 }
1858#if !defined(_TARGET_64BIT_)
1859 if (varTypeIsLong(srcType))
1860 {
1861 noway_assert(castOp->OperGet() == GT_LONG);
1862 castOp->SetContained();
1863 }
1864#endif // !defined(_TARGET_64BIT_)
1865}
1866
1867//------------------------------------------------------------------------
1868// ContainCheckCompare: determine whether the sources of a compare node should be contained.
1869//
1870// Arguments:
1871// node - pointer to the node
1872//
1873void Lowering::ContainCheckCompare(GenTreeOp* cmp)
1874{
1875 assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP));
1876
1877 GenTree* op1 = cmp->gtOp.gtOp1;
1878 GenTree* op2 = cmp->gtOp.gtOp2;
1879 var_types op1Type = op1->TypeGet();
1880 var_types op2Type = op2->TypeGet();
1881
    // If either op1 or op2 is a floating point value, then we need to use
    // ucomiss or ucomisd to compare, both of which support the following form:
    //     ucomis[s|d] xmm, xmm/mem
    // That is, only the second operand can be a memory op.
1886 //
1887 // Second operand is a memory Op: Note that depending on comparison operator,
1888 // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
1889 // op2 can be a memory op depending on the comparison operator.
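    // (Illustrative: an ordered "a > b" keeps its operand order, so something like
    // "ucomisd xmm_a, qword ptr [b]" followed by ja can be emitted and op2 is the memory
    // candidate; an ordered "a < b" reverses the operands, making op1 the candidate below.)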
1890 if (varTypeIsFloating(op1Type))
1891 {
        // The types of the operands have to be the same; no implicit conversions are allowed at this stage.
1893 assert(op1Type == op2Type);
1894
1895 bool reverseOps;
1896 if ((cmp->gtFlags & GTF_RELOP_NAN_UN) != 0)
1897 {
1898 // Unordered comparison case
1899 reverseOps = cmp->OperIs(GT_GT, GT_GE);
1900 }
1901 else
1902 {
1903 reverseOps = cmp->OperIs(GT_LT, GT_LE);
1904 }
1905
1906 GenTree* otherOp;
1907 if (reverseOps)
1908 {
1909 otherOp = op1;
1910 }
1911 else
1912 {
1913 otherOp = op2;
1914 }
1915
1916 assert(otherOp != nullptr);
1917 bool isSafeToContainOtherOp = true;
1918 if (otherOp->IsCnsNonZeroFltOrDbl())
1919 {
1920 MakeSrcContained(cmp, otherOp);
1921 }
1922 else if (IsContainableMemoryOp(otherOp))
1923 {
1924 isSafeToContainOtherOp = IsSafeToContainMem(cmp, otherOp);
1925 if (isSafeToContainOtherOp)
1926 {
1927 MakeSrcContained(cmp, otherOp);
1928 }
1929 }
1930
1931 if (!otherOp->isContained() && isSafeToContainOtherOp && IsSafeToContainMem(cmp, otherOp))
1932 {
1933 // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
1934 // contained, we can mark it reg-optional.
1935 // IsSafeToContainMem is expensive so we call it at most once for otherOp.
1936 // If we already called IsSafeToContainMem, it must have returned false;
1937 // otherwise, otherOp would be contained.
1938 otherOp->SetRegOptional();
1939 }
1940
1941 return;
1942 }
1943
1944 // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
1945 // or in other backend.
1946
1947 if (CheckImmedAndMakeContained(cmp, op2))
1948 {
1949 // If the types are the same, or if the constant is of the correct size,
1950 // we can treat the MemoryOp as contained.
1951 if (op1Type == op2Type)
1952 {
1953 if (IsContainableMemoryOp(op1))
1954 {
1955 MakeSrcContained(cmp, op1);
1956 }
1957 else
1958 {
1959 op1->SetRegOptional();
1960 }
1961 }
1962 }
1963 else if (op1Type == op2Type)
1964 {
        // Note that TEST does not have an r,rm encoding like CMP does, but we can still
        // contain the second operand because the emitter maps both r,rm and rm,r to
        // the same instruction code. This avoids the need to special case TEST here.
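        // (Illustrative: "cmp eax, dword ptr [mem]" has a direct encoding, while for TEST the
        // emitter flips to "test dword ptr [mem], eax", which sets the same flags.)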
1968
1969 bool isSafeToContainOp1 = true;
1970 bool isSafeToContainOp2 = true;
1971
1972 if (IsContainableMemoryOp(op2))
1973 {
1974 isSafeToContainOp2 = IsSafeToContainMem(cmp, op2);
1975 if (isSafeToContainOp2)
1976 {
1977 MakeSrcContained(cmp, op2);
1978 }
1979 }
1980
1981 if (!op2->isContained() && IsContainableMemoryOp(op1))
1982 {
1983 isSafeToContainOp1 = IsSafeToContainMem(cmp, op1);
1984 if (isSafeToContainOp1)
1985 {
1986 MakeSrcContained(cmp, op1);
1987 }
1988 }
1989
1990 if (!op1->isContained() && !op2->isContained())
1991 {
1992 // One of op1 or op2 could be marked as reg optional
1993 // to indicate that codegen can still generate code
1994 // if one of them is on stack.
1995 GenTree* regOptionalCandidate = op1->IsCnsIntOrI() ? op2 : PreferredRegOptionalOperand(cmp);
1996
1997 // IsSafeToContainMem is expensive so we call it at most once for each operand
1998 // in this method. If we already called IsSafeToContainMem, it must have returned false;
1999 // otherwise, the corresponding operand (op1 or op2) would be contained.
2000 bool setRegOptional = (regOptionalCandidate == op1) ? isSafeToContainOp1 && IsSafeToContainMem(cmp, op1)
2001 : isSafeToContainOp2 && IsSafeToContainMem(cmp, op2);
2002 if (setRegOptional)
2003 {
2004 regOptionalCandidate->SetRegOptional();
2005 }
2006 }
2007 }
2008}
2009
2010//------------------------------------------------------------------------
2011// LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly
2012//
2013// Arguments:
2014// node - The indirect store node (GT_STORE_IND) of interest
2015//
2016// Return Value:
2017// Returns true if 'node' is a valid RMW mem op; false otherwise.
2018//
2019bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd)
2020{
2021 assert(storeInd->OperGet() == GT_STOREIND);
2022
2023 // SSE2 doesn't support RMW on float values
2024 assert(!varTypeIsFloating(storeInd));
2025
2026 // Terminology:
2027 // indirDst = memory write of an addr mode (i.e. storeind destination)
    //   indirSrc = value being written to memory (i.e. storeind source which could be a binary/unary op)
    //   indirCandidate = memory read, i.e. a GT_IND of an addr mode
    //   indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
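    //
    // For example (illustrative), "*p = *p + 4" can lower to a single "add dword ptr [addrMode], 4",
    // with the load (indirCandidate), the operation (indirSrc) and the address operands all
    // contained in the GT_STOREIND.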
2031
2032 GenTree* indirCandidate = nullptr;
2033 GenTree* indirOpSource = nullptr;
2034
2035 if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
2036 {
2037 JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
2038 storeInd->AsStoreInd()->GetRMWStatus());
2039 DISPTREERANGE(BlockRange(), storeInd);
2040 return false;
2041 }
2042
2043 GenTree* indirDst = storeInd->gtGetOp1();
2044 GenTree* indirSrc = storeInd->gtGetOp2();
2045 genTreeOps oper = indirSrc->OperGet();
2046
2047 // At this point we have successfully detected a RMW memory op of one of the following forms
2048 // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
    //         storeInd(indirDst, indirSrc(indirOpSource, indirCandidate)) in case of commutative operations OR
    //         storeInd(indirDst, indirSrc(indirCandidate)) in case of unary operations
2051 //
2052 // Here indirSrc = one of the supported binary or unary operation for RMW of memory
2053 // indirCandidate = a GT_IND node
2054 // indirCandidateChild = operand of GT_IND indirCandidate
2055 //
2056 // The logic below does the following
2057 // Make indirOpSource contained.
2058 // Make indirSrc contained.
2059 // Make indirCandidate contained.
2060 // Make indirCandidateChild contained.
2061 // Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr
2062 // base.
2063 //
2064
2065 // We have already done containment analysis on the indirSrc op.
2066 // If any of its operands are marked regOptional, reset that now.
2067 indirSrc->AsOp()->gtOp1->ClearRegOptional();
2068 if (GenTree::OperIsBinary(oper))
2069 {
2070 // On Xarch RMW operations require the source to be an immediate or in a register.
2071 // Therefore, if we have previously marked the indirOpSource as contained while lowering
2072 // the binary node, we need to reset that now.
2073 if (IsContainableMemoryOp(indirOpSource))
2074 {
2075 indirOpSource->ClearContained();
2076 }
2077 indirSrc->AsOp()->gtOp2->ClearRegOptional();
        JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
2079 }
2080 else
2081 {
2082 assert(GenTree::OperIsUnary(oper));
        JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
2084 }
2085 DISPTREERANGE(BlockRange(), storeInd);
2086
2087 indirSrc->SetContained();
2088 indirCandidate->SetContained();
2089
2090 GenTree* indirCandidateChild = indirCandidate->gtGetOp1();
2091 indirCandidateChild->SetContained();
2092
2093 if (indirCandidateChild->OperGet() == GT_LEA)
2094 {
2095 GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
2096
2097 if (addrMode->HasBase())
2098 {
2099 assert(addrMode->Base()->OperIsLeaf());
2100 addrMode->Base()->SetContained();
2101 }
2102
2103 if (addrMode->HasIndex())
2104 {
2105 assert(addrMode->Index()->OperIsLeaf());
2106 addrMode->Index()->SetContained();
2107 }
2108
2109 indirDst->SetContained();
2110 }
2111 else
2112 {
2113 assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
2114 indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
2115
        // If it is a GT_LCL_VAR, it still needs a reg to hold the address.
        // We would still need a reg for GT_CNS_INT if it doesn't fit within the addressing mode base.
        // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address is
        // known at jit time. Likewise, GT_LCL_VAR_ADDR is a frame-relative address and doesn't need a reg.
2120 if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
2121 {
2122 indirDst->SetContained();
2123 }
2124 else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
2125 {
2126 indirDst->SetContained();
2127 }
2128 }
2129 return true;
2130}
2131
2132//------------------------------------------------------------------------
2133// ContainCheckBinary: Determine whether a binary op's operands should be contained.
2134//
2135// Arguments:
2136// node - the node we care about
2137//
2138void Lowering::ContainCheckBinary(GenTreeOp* node)
2139{
2140 assert(node->OperIsBinary());
2141
2142 if (varTypeIsFloating(node))
2143 {
2144 assert(node->OperIs(GT_ADD, GT_SUB));
2145 ContainCheckFloatBinary(node);
2146 return;
2147 }
2148
2149 GenTree* op1 = node->gtOp1;
2150 GenTree* op2 = node->gtOp2;
2151
    // We can directly encode the second operand if it is either a containable constant or a memory-op.
    // In the case of a memory-op, we can encode it directly provided its type matches the 'node' type.
    // This is because during codegen the type of 'node' is used to determine the emit type size. If the
    // types do not match, they get normalized (i.e. sign/zero extended) on load into a register.
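    // (Illustrative: if 'node' is TYP_INT but op2 is a 2-byte memory operand, we cannot fold it
    // as something like "add r32, word ptr [mem]"; the small load has to be sign/zero extended
    // into a register first, so it is not marked as contained here.)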
2156 bool directlyEncodable = false;
2157 bool binOpInRMW = false;
2158 GenTree* operand = nullptr;
2159 bool isSafeToContainOp1 = true;
2160 bool isSafeToContainOp2 = true;
2161
2162 if (IsContainableImmed(node, op2))
2163 {
2164 directlyEncodable = true;
2165 operand = op2;
2166 }
2167 else
2168 {
2169 binOpInRMW = IsBinOpInRMWStoreInd(node);
2170 if (!binOpInRMW)
2171 {
2172 const unsigned operatorSize = genTypeSize(node->TypeGet());
2173 if ((genTypeSize(op2->TypeGet()) == operatorSize) && IsContainableMemoryOp(op2))
2174 {
2175 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
2176 if (isSafeToContainOp2)
2177 {
2178 directlyEncodable = true;
2179 operand = op2;
2180 }
2181 }
2182
2183 if ((operand == nullptr) && node->OperIsCommutative())
2184 {
2185 // If it is safe, we can reverse the order of operands of commutative operations for efficient
2186 // codegen
2187 if (IsContainableImmed(node, op1))
2188 {
2189 directlyEncodable = true;
2190 operand = op1;
2191 }
2192 else if ((genTypeSize(op1->TypeGet()) == operatorSize) && IsContainableMemoryOp(op1))
2193 {
2194 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
2195 if (isSafeToContainOp1)
2196 {
2197 directlyEncodable = true;
2198 operand = op1;
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 if (directlyEncodable)
2206 {
2207 assert(operand != nullptr);
2208 MakeSrcContained(node, operand);
2209 }
2210 else if (!binOpInRMW)
2211 {
2212 // If this binary op neither has contained operands, nor is a
2213 // Read-Modify-Write (RMW) operation, we can mark its operands
2214 // as reg optional.
2215
2216 // IsSafeToContainMem is expensive so we call it at most once for each operand
2217 // in this method. If we already called IsSafeToContainMem, it must have returned false;
2218 // otherwise, directlyEncodable would be true.
2219 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
2220 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
2221
2222 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
2223 }
2224}
2225
2226//------------------------------------------------------------------------
2227// ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained.
2228//
2229// Arguments:
2230// node - pointer to the node
2231//
2232void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node)
2233{
2234 assert(node->OperIsBoundsCheck());
2235 GenTree* other;
2236 if (CheckImmedAndMakeContained(node, node->gtIndex))
2237 {
2238 other = node->gtArrLen;
2239 }
2240 else if (CheckImmedAndMakeContained(node, node->gtArrLen))
2241 {
2242 other = node->gtIndex;
2243 }
2244 else if (IsContainableMemoryOp(node->gtIndex))
2245 {
2246 other = node->gtIndex;
2247 }
2248 else
2249 {
2250 other = node->gtArrLen;
2251 }
2252
2253 if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
2254 {
2255 if (IsContainableMemoryOp(other))
2256 {
2257 MakeSrcContained(node, other);
2258 }
2259 else
2260 {
2261 // We can mark 'other' as reg optional, since it is not contained.
2262 other->SetRegOptional();
2263 }
2264 }
2265}
2266
2267//------------------------------------------------------------------------
2268// ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained.
2269//
2270// Arguments:
2271// node - pointer to the node
2272//
2273void Lowering::ContainCheckIntrinsic(GenTreeOp* node)
2274{
2275 assert(node->OperIs(GT_INTRINSIC));
2276
2277 CorInfoIntrinsics intrinsicId = node->gtIntrinsic.gtIntrinsicId;
2278
2279 if (intrinsicId == CORINFO_INTRINSIC_Sqrt || intrinsicId == CORINFO_INTRINSIC_Round ||
2280 intrinsicId == CORINFO_INTRINSIC_Ceiling || intrinsicId == CORINFO_INTRINSIC_Floor)
2281 {
2282 GenTree* op1 = node->gtGetOp1();
2283 if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl())
2284 {
2285 MakeSrcContained(node, op1);
2286 }
2287 else
2288 {
2289 // Mark the operand as reg optional since codegen can still
2290 // generate code if op1 is on stack.
2291 op1->SetRegOptional();
2292 }
2293 }
2294}
2295
2296#ifdef FEATURE_SIMD
2297//----------------------------------------------------------------------------------------------
2298// ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node.
2299//
2300// Arguments:
2301// simdNode - The SIMD intrinsic node.
2302//
2303void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
2304{
2305 switch (simdNode->gtSIMDIntrinsicID)
2306 {
2307 GenTree* op1;
2308 GenTree* op2;
2309
2310 case SIMDIntrinsicInit:
2311 {
2312 op1 = simdNode->gtOp.gtOp1;
2313#ifndef _TARGET_64BIT_
2314 if (op1->OperGet() == GT_LONG)
2315 {
2316 MakeSrcContained(simdNode, op1);
2317 GenTree* op1lo = op1->gtGetOp1();
2318 GenTree* op1hi = op1->gtGetOp2();
2319
2320 if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
2321 (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
2322 {
2323 MakeSrcContained(op1, op1lo);
2324 MakeSrcContained(op1, op1hi);
2325 }
2326 }
2327 else
2328#endif // !_TARGET_64BIT_
2329 if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
2330 (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
2331 {
2332 MakeSrcContained(simdNode, op1);
2333 }
2334 else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) &&
2335 ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32)))
2336 {
2337 // Either op1 is a float or dbl constant or an addr
2338 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
2339 {
2340 MakeSrcContained(simdNode, op1);
2341 }
2342 }
2343 }
2344 break;
2345
2346 case SIMDIntrinsicInitArray:
2347 // We have an array and an index, which may be contained.
2348 CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2());
2349 break;
2350
2351 case SIMDIntrinsicOpEquality:
2352 case SIMDIntrinsicOpInEquality:
2353 // On SSE4/AVX, we can generate optimal code for (in)equality
2354 // against zero using ptest. We can safely do this optimization
            // for integral vectors but not for floating-point, because we have
            // +0.0 and -0.0, and +0.0 == -0.0.
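            // (Illustrative: comparing an integral vector against zero can be done with
            // "ptest xmm, xmm" + setcc, since ZF is set iff every bit is zero; that bitwise
            // test would be wrong for -0.0, which has non-zero bits but equals +0.0.)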
2357 op2 = simdNode->gtGetOp2();
2358 if ((comp->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0))
2359 {
2360 MakeSrcContained(simdNode, op2);
2361 }
2362 break;
2363
2364 case SIMDIntrinsicGetItem:
2365 {
            // This implements the get_Item method. The sources are:
2367 // - the source SIMD struct
2368 // - index (which element to get)
2369 // The result is baseType of SIMD struct.
2370 op1 = simdNode->gtOp.gtOp1;
2371 op2 = simdNode->gtOp.gtOp2;
2372
2373 if (op1->OperGet() == GT_IND)
2374 {
2375 assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
2376 op1->AsIndir()->Addr()->ClearContained();
2377 }
2378 // If the index is a constant, mark it as contained.
2379 CheckImmedAndMakeContained(simdNode, op2);
2380
2381 if (IsContainableMemoryOp(op1))
2382 {
2383 MakeSrcContained(simdNode, op1);
2384 if (op1->OperGet() == GT_IND)
2385 {
2386 op1->AsIndir()->Addr()->ClearContained();
2387 }
2388 }
2389 }
2390 break;
2391
2392 case SIMDIntrinsicShuffleSSE2:
2393 // Second operand is an integer constant and marked as contained.
2394 assert(simdNode->gtOp.gtOp2->IsCnsIntOrI());
2395 MakeSrcContained(simdNode, simdNode->gtOp.gtOp2);
2396 break;
2397
2398 default:
2399 break;
2400 }
2401}
2402#endif // FEATURE_SIMD
2403
2404#ifdef FEATURE_HW_INTRINSICS
2405//----------------------------------------------------------------------------------------------
2406// IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op.
2407//
2408// Arguments:
2409// containingNode - The hardware intrinsic node which contains 'node'
2410// node - The node to check
2411// [Out] supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands;
2412// otherwise, false.
2413//
2414// Return Value:
2415// true if 'node' is a containable hardware intrinsic node; otherwise, false.
2416//
2417bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node, bool* supportsRegOptional)
2418{
2419 NamedIntrinsic containingIntrinsicId = containingNode->gtHWIntrinsicId;
2420 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(containingIntrinsicId);
2421
2422 // We shouldn't have called in here if containingNode doesn't support containment
2423 assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId));
2424
2425 // containingNode supports nodes that read from an aligned memory address
2426 //
2427 // This will generally be an explicit LoadAligned instruction and is generally
2428 // false for machines with VEX support. This is because there is currently no way
2429 // to guarantee that the address read from will always be aligned and we could silently
2430 // change the behavior of the program in the case where an Access Violation would have
2431 // otherwise occurred.
2432 bool supportsAlignedSIMDLoads = false;
2433
2434 // containingNode supports nodes that read from general memory
2435 //
2436 // We currently have to assume all "general" loads are unaligned. As such, this is
2437 // generally used to determine if we can mark the node as `regOptional` in the case
2438 // where `node` is not containable. However, this can also be used to determine whether
2439 // we can mark other types of reads as contained (such as when directly reading a local).
2440 bool supportsGeneralLoads = false;
2441
2442 // containingNode supports nodes that read from a scalar memory address
2443 //
2444 // This will generally be an explicit LoadScalar instruction but is also used to determine
2445 // whether we can read an address of type T (we don't support this when the load would
2446 // read more than sizeof(T) bytes).
2447 bool supportsSIMDScalarLoads = false;
2448
2449 // containingNode supports nodes that read from an unaligned memory address
2450 //
2451 // This will generally be an explicit Load instruction and is generally false for machines
2452 // without VEX support. This is because older hardware required that the SIMD operand always
2453 // be aligned to the 'natural alignment' of the type.
2454 bool supportsUnalignedSIMDLoads = false;
2455
2456 switch (category)
2457 {
2458 case HW_Category_SimpleSIMD:
2459 {
2460 // These intrinsics only expect 16 or 32-byte nodes for containment
2461 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2462 assert(supportsSIMDScalarLoads == false);
2463
2464 supportsAlignedSIMDLoads =
2465 !comp->canUseVexEncoding() && (containingIntrinsicId != NI_SSE2_ConvertToVector128Double);
2466 supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
2467 supportsGeneralLoads = supportsUnalignedSIMDLoads;
2468
2469 break;
2470 }
2471
2472 case HW_Category_IMM:
2473 {
2474 switch (containingIntrinsicId)
2475 {
2476 case NI_SSE_Shuffle:
2477 case NI_SSE2_CompareLessThan:
2478 case NI_SSE2_ShiftLeftLogical:
2479 case NI_SSE2_ShiftRightArithmetic:
2480 case NI_SSE2_ShiftRightLogical:
2481 case NI_SSE2_Shuffle:
2482 case NI_SSE2_ShuffleHigh:
2483 case NI_SSE2_ShuffleLow:
2484 case NI_SSSE3_AlignRight:
2485 case NI_SSE41_Blend:
2486 case NI_SSE41_DotProduct:
2487 case NI_SSE41_MultipleSumAbsoluteDifferences:
2488 case NI_AES_KeygenAssist:
2489 case NI_PCLMULQDQ_CarrylessMultiply:
2490 case NI_AVX_Blend:
2491 case NI_AVX_Compare:
2492 case NI_AVX_DotProduct:
2493 case NI_AVX_InsertVector128:
2494 case NI_AVX_Permute:
2495 case NI_AVX_Permute2x128:
2496 case NI_AVX2_Blend:
2497 case NI_AVX2_InsertVector128:
2498 case NI_AVX2_MultipleSumAbsoluteDifferences:
2499 case NI_AVX2_Permute2x128:
2500 case NI_AVX2_Permute4x64:
2501 case NI_AVX2_ShiftLeftLogical:
2502 case NI_AVX2_ShiftRightArithmetic:
2503 case NI_AVX2_ShiftRightLogical:
2504 case NI_AVX2_ShuffleHigh:
2505 case NI_AVX2_ShuffleLow:
2506 {
2507 // These intrinsics only expect 16 or 32-byte nodes for containment
2508 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2509 assert(supportsSIMDScalarLoads == false);
2510
2511 supportsAlignedSIMDLoads = !comp->canUseVexEncoding();
2512 supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
2513 supportsGeneralLoads = supportsUnalignedSIMDLoads;
2514
2515 break;
2516 }
2517
2518 case NI_SSE2_Insert:
2519 case NI_SSE41_Insert:
2520 case NI_SSE41_X64_Insert:
2521 {
2522 if (containingNode->gtSIMDBaseType == TYP_FLOAT)
2523 {
2524 assert(containingIntrinsicId == NI_SSE41_Insert);
2525 assert(genTypeSize(node->TypeGet()) == 16);
2526
2527 // Sse41.Insert(V128<float>, V128<float>, byte) is a bit special
2528 // in that it has different behavior depending on whether the
2529 // second operand is coming from a register or memory. When coming
2530 // from a register, all 4 elements of the vector can be used and it
2531 // is effectively a regular `SimpleSIMD` operation; but when loading
2532 // from memory, it only works with the lowest element and is effectively
2533 // a `SIMDScalar`.
2534
2535 assert(supportsAlignedSIMDLoads == false);
2536 assert(supportsUnalignedSIMDLoads == false);
2537 assert(supportsGeneralLoads == false);
2538 assert(supportsSIMDScalarLoads == false);
2539
2540 GenTree* op1 = containingNode->gtGetOp1();
2541 GenTree* op2 = nullptr;
2542 GenTree* op3 = nullptr;
2543
2544 assert(op1->OperIsList());
2545 assert(containingNode->gtGetOp2() == nullptr);
2546
2547 GenTreeArgList* argList = op1->AsArgList();
2548
2549 op1 = argList->Current();
2550 argList = argList->Rest();
2551
2552 op2 = argList->Current();
2553 argList = argList->Rest();
2554
2555 assert(node == op2);
2556
2557 op3 = argList->Current();
2558
2559 // The upper two bits of the immediate value are ignored if
2560 // op2 comes from memory. In order to support using the upper
2561 // bits, we need to disable containment support if op3 is not
2562 // constant or if the constant is greater than 0x3F (which means
2563 // at least one of the upper two bits is set).
2564
2565 if (op3->IsCnsIntOrI())
2566 {
2567 ssize_t ival = op3->AsIntCon()->IconValue();
2568 assert((ival >= 0) && (ival <= 255));
2569
2570 supportsSIMDScalarLoads = (ival <= 0x3F);
2571 supportsGeneralLoads = supportsSIMDScalarLoads;
2572 }
2573 break;
2574 }
2575
2576 // We should only get here for integral nodes.
2577 assert(varTypeIsIntegral(node->TypeGet()));
2578
2579 assert(supportsAlignedSIMDLoads == false);
2580 assert(supportsUnalignedSIMDLoads == false);
2581 assert(supportsSIMDScalarLoads == false);
2582
2583 const unsigned expectedSize = genTypeSize(containingNode->gtSIMDBaseType);
2584 const unsigned operandSize = genTypeSize(node->TypeGet());
2585
2586 supportsGeneralLoads = (operandSize >= expectedSize);
2587 break;
2588 }
2589
2590 case NI_AVX_CompareScalar:
2591 {
2592 // These intrinsics only expect 16 or 32-byte nodes for containment
2593 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2594
2595 assert(supportsAlignedSIMDLoads == false);
2596 assert(supportsUnalignedSIMDLoads == false);
2597
2598 supportsSIMDScalarLoads = true;
2599 supportsGeneralLoads = supportsSIMDScalarLoads;
2600 break;
2601 }
2602
2603 default:
2604 {
2605 assert(supportsAlignedSIMDLoads == false);
2606 assert(supportsGeneralLoads == false);
2607 assert(supportsSIMDScalarLoads == false);
2608 assert(supportsUnalignedSIMDLoads == false);
2609 break;
2610 }
2611 }
2612 break;
2613 }
2614
2615 case HW_Category_SIMDScalar:
2616 {
2617 assert(supportsAlignedSIMDLoads == false);
2618 assert(supportsUnalignedSIMDLoads == false);
2619
2620 switch (containingIntrinsicId)
2621 {
2622 case NI_Base_Vector128_CreateScalarUnsafe:
2623 case NI_Base_Vector256_CreateScalarUnsafe:
2624 {
2625 assert(supportsSIMDScalarLoads == false);
2626
2627 const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType));
2628 const unsigned operandSize = genTypeSize(node->TypeGet());
2629
2630 supportsGeneralLoads = (operandSize == expectedSize);
2631 break;
2632 }
2633
2634 case NI_SSE_ConvertScalarToVector128Single:
2635 case NI_SSE2_ConvertScalarToVector128Double:
2636 case NI_SSE2_ConvertScalarToVector128Int32:
2637 case NI_SSE2_ConvertScalarToVector128UInt32:
2638 case NI_SSE_X64_ConvertScalarToVector128Single:
2639 case NI_SSE2_X64_ConvertScalarToVector128Double:
2640 case NI_SSE2_X64_ConvertScalarToVector128Int64:
2641 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
2642 {
2643 if (!varTypeIsIntegral(node->TypeGet()))
2644 {
2645 // The floating-point overload doesn't require any special semantics
2646 assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double);
2647 supportsSIMDScalarLoads = true;
2648 supportsGeneralLoads = supportsSIMDScalarLoads;
2649 break;
2650 }
2651
2652 assert(supportsSIMDScalarLoads == false);
2653
2654 const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType));
2655 const unsigned operandSize = genTypeSize(node->TypeGet());
2656
2657 supportsGeneralLoads = (operandSize == expectedSize);
2658 break;
2659 }
2660
2661 default:
2662 {
2663 // These intrinsics only expect 16 or 32-byte nodes for containment
2664 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2665
2666 supportsSIMDScalarLoads = true;
2667 supportsGeneralLoads = supportsSIMDScalarLoads;
2668 break;
2669 }
2670 }
2671 break;
2672 }
2673
2674 case HW_Category_Scalar:
2675 {
2676 // We should only get here for integral nodes.
2677 assert(varTypeIsIntegral(node->TypeGet()));
2678
2679 assert(supportsAlignedSIMDLoads == false);
2680 assert(supportsUnalignedSIMDLoads == false);
2681 assert(supportsSIMDScalarLoads == false);
2682
2683 const unsigned expectedSize = genTypeSize(containingNode->TypeGet());
2684 const unsigned operandSize = genTypeSize(node->TypeGet());
2685
2686 supportsGeneralLoads = (operandSize >= expectedSize);
2687 break;
2688 }
2689
2690 default:
2691 {
2692 assert(supportsAlignedSIMDLoads == false);
2693 assert(supportsGeneralLoads == false);
2694 assert(supportsSIMDScalarLoads == false);
2695 assert(supportsUnalignedSIMDLoads == false);
2696 break;
2697 }
2698 }
2699
2700 noway_assert(supportsRegOptional != nullptr);
2701 *supportsRegOptional = supportsGeneralLoads;
2702
2703 if (!node->OperIsHWIntrinsic())
2704 {
2705 return supportsGeneralLoads && IsContainableMemoryOp(node);
2706 }
2707
2708 // TODO-XArch: Update this to be table driven, if possible.
2709
2710 NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->gtHWIntrinsicId;
2711
2712 switch (intrinsicId)
2713 {
2714 case NI_SSE_LoadAlignedVector128:
2715 case NI_SSE2_LoadAlignedVector128:
2716 case NI_AVX_LoadAlignedVector256:
2717 {
2718 return supportsAlignedSIMDLoads;
2719 }
2720
2721 case NI_SSE_LoadScalarVector128:
2722 case NI_SSE2_LoadScalarVector128:
2723 {
2724 return supportsSIMDScalarLoads;
2725 }
2726
2727 // VEX encoding supports unaligned memory ops, so we can fold them
2728 case NI_SSE_LoadVector128:
2729 case NI_SSE2_LoadVector128:
2730 case NI_AVX_LoadVector256:
2731 {
2732 return supportsUnalignedSIMDLoads;
2733 }
2734
2735 default:
2736 {
2737 assert(!node->isContainableHWIntrinsic());
2738 return false;
2739 }
2740 }
2741}
2742
2743//----------------------------------------------------------------------------------------------
2744// ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
2745//
2746// Arguments:
2747// node - The hardware intrinsic node.
2748//
2749void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
2750{
2751 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2752 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
2753 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2754 var_types baseType = node->gtSIMDBaseType;
2755
2756 GenTree* op1 = node->gtGetOp1();
2757 GenTree* op2 = node->gtGetOp2();
2758 GenTree* op3 = nullptr;
2759
2760 if (!HWIntrinsicInfo::SupportsContainment(intrinsicId))
2761 {
        // AVX2 gather intrinsics are not containable and always have a constant IMM argument
2763 if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId))
2764 {
2765 GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
2766 assert(lastOp != nullptr);
2767 MakeSrcContained(node, lastOp);
2768 }
2769 // Exit early if containment isn't supported
2770 return;
2771 }
2772
2773 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
2774
2775 const bool isCommutative = HWIntrinsicInfo::IsCommutative(intrinsicId);
2776
2777 if (numArgs == 1)
2778 {
2779 // One argument intrinsics cannot be commutative
2780 assert(!isCommutative);
2781
2782 assert(!op1->OperIsList());
2783 assert(op2 == nullptr);
2784
2785 switch (category)
2786 {
2787 case HW_Category_SimpleSIMD:
2788 case HW_Category_SIMDScalar:
2789 case HW_Category_Scalar:
2790 {
2791 switch (intrinsicId)
2792 {
2793 case NI_SSE_ReciprocalScalar:
2794 case NI_SSE_ReciprocalSqrtScalar:
2795 case NI_SSE_SqrtScalar:
2796 case NI_SSE2_SqrtScalar:
2797 case NI_SSE41_CeilingScalar:
2798 case NI_SSE41_FloorScalar:
2799 case NI_SSE41_RoundCurrentDirectionScalar:
2800 case NI_SSE41_RoundToNearestIntegerScalar:
2801 case NI_SSE41_RoundToNegativeInfinityScalar:
2802 case NI_SSE41_RoundToPositiveInfinityScalar:
2803 case NI_SSE41_RoundToZeroScalar:
2804 {
2805 // These intrinsics have both 1 and 2-operand overloads.
2806 //
2807 // The 1-operand overload basically does `intrinsic(op1, op1)`
2808 //
2809 // Because of this, the operand must be loaded into a register
2810 // and cannot be contained.
2811 return;
2812 }
2813
2814 case NI_SSE2_ConvertToInt32:
2815 case NI_SSE2_X64_ConvertToInt64:
2816 case NI_SSE2_ConvertToUInt32:
2817 case NI_SSE2_X64_ConvertToUInt64:
2818 case NI_AVX2_ConvertToInt32:
2819 case NI_AVX2_ConvertToUInt32:
2820 {
2821 if (varTypeIsIntegral(baseType))
2822 {
2823 // These intrinsics are "ins reg/mem, xmm" and don't
2824 // currently support containment.
2825 return;
2826 }
2827
2828 break;
2829 }
2830
2831 default:
2832 {
2833 break;
2834 }
2835 }
2836
2837 bool supportsRegOptional = false;
2838
2839 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2840 {
2841 MakeSrcContained(node, op1);
2842 }
2843 else if (supportsRegOptional)
2844 {
2845 op1->SetRegOptional();
2846 }
2847 break;
2848 }
2849
2850 default:
2851 {
2852 unreached();
2853 break;
2854 }
2855 }
2856 }
2857 else
2858 {
2859 if (numArgs == 2)
2860 {
2861 assert(!op1->OperIsList());
2862 assert(op2 != nullptr);
2863 assert(!op2->OperIsList());
2864
2865 switch (category)
2866 {
2867 case HW_Category_SimpleSIMD:
2868 case HW_Category_SIMDScalar:
2869 case HW_Category_Scalar:
2870 {
2871 if (HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId))
2872 {
2873 switch (intrinsicId)
2874 {
2875 case NI_SSE_CompareLessThanOrderedScalar:
2876 case NI_SSE_CompareLessThanUnorderedScalar:
2877 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
2878 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
2879 case NI_SSE2_CompareLessThanOrderedScalar:
2880 case NI_SSE2_CompareLessThanUnorderedScalar:
2881 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
2882 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
2883 {
                                // We need to swap the operands for these CompareLessThan* scalar intrinsics
2885 node->gtOp1 = op2;
2886 node->gtOp2 = op1;
2887 op2 = op1;
2888 break;
2889 }
2890
2891 default:
2892 {
                                // TODO-XArch-CQ: The Compare*OrderedScalar and Compare*UnorderedScalar methods
                                // are commutative if you also invert the intrinsic.
2895 break;
2896 }
2897 }
2898 }
2899
2900 bool supportsRegOptional = false;
2901
2902 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
2903 {
2904 MakeSrcContained(node, op2);
2905 }
2906 else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) ||
2907 (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) &&
2908 IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2909 {
2910 MakeSrcContained(node, op1);
2911
2912 // Swap the operands here to make the containment checks in codegen significantly simpler
2913 node->gtOp1 = op2;
2914 node->gtOp2 = op1;
2915 }
2916 else if (supportsRegOptional)
2917 {
2918 op2->SetRegOptional();
2919
2920 // TODO-XArch-CQ: For commutative nodes, either operand can be reg-optional.
2921 // https://github.com/dotnet/coreclr/issues/6361
2922 }
2923 break;
2924 }
2925
2926 case HW_Category_IMM:
2927 {
2928 // We don't currently have any IMM intrinsics which are also commutative
2929 assert(!isCommutative);
2930 bool supportsRegOptional = false;
2931
2932 switch (intrinsicId)
2933 {
2934 case NI_SSE2_ShiftLeftLogical:
2935 case NI_SSE2_ShiftRightArithmetic:
2936 case NI_SSE2_ShiftRightLogical:
2937 case NI_AVX2_ShiftLeftLogical:
2938 case NI_AVX2_ShiftRightArithmetic:
2939 case NI_AVX2_ShiftRightLogical:
2940 {
2941 // These intrinsics can have op2 be imm or reg/mem
2942
2943 if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2))
2944 {
2945 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
2946 {
2947 MakeSrcContained(node, op2);
2948 }
2949 else if (supportsRegOptional)
2950 {
2951 op2->SetRegOptional();
2952 }
2953 }
2954 break;
2955 }
2956
2957 case NI_SSE2_Shuffle:
2958 case NI_SSE2_ShuffleHigh:
2959 case NI_SSE2_ShuffleLow:
2960 case NI_AVX2_Permute4x64:
2961 {
2962 // These intrinsics have op2 as an imm and op1 as a reg/mem
2963
2964 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2965 {
2966 MakeSrcContained(node, op1);
2967 }
2968 else if (supportsRegOptional)
2969 {
2970 op1->SetRegOptional();
2971 }
2972 break;
2973 }
2974
2975 case NI_AVX_Permute:
2976 {
2977 // These intrinsics can have op2 be imm or reg/mem
2978 // They also can have op1 be reg/mem and op2 be imm
2979
2980 if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
2981 {
2982 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2983 {
2984 MakeSrcContained(node, op1);
2985 }
2986 else if (supportsRegOptional)
2987 {
2988 op1->SetRegOptional();
2989 }
2990 }
2991 else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
2992 {
2993 MakeSrcContained(node, op2);
2994 }
2995 else if (supportsRegOptional)
2996 {
2997 op2->SetRegOptional();
2998 }
2999 break;
3000 }
3001
3002 case NI_AES_KeygenAssist:
3003 {
3004 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3005 {
3006 MakeSrcContained(node, op1);
3007 }
3008 else if (supportsRegOptional)
3009 {
3010 op1->SetRegOptional();
3011 }
3012 break;
3013 }
3014
3015 default:
3016 {
3017 break;
3018 }
3019 }
3020
3021 break;
3022 }
3023
3024 case HW_Category_Special:
3025 {
3026 if (intrinsicId == NI_SSE2_CompareLessThan)
3027 {
3028 bool supportsRegOptional = false;
3029
3030 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3031 {
3032 MakeSrcContained(node, op2);
3033 }
3034 else if (supportsRegOptional)
3035 {
3036 op2->SetRegOptional();
3037 }
3038 }
3039 else
3040 {
3041 unreached();
3042 }
3043 break;
3044 }
3045
3046 default:
3047 {
3048 unreached();
3049 break;
3050 }
3051 }
3052 }
3053 else if (numArgs == 3)
3054 {
3055 // three argument intrinsics should not be marked commutative
3056 assert(!isCommutative);
3057
3058 assert(op1->OperIsList());
3059 assert(op2 == nullptr);
3060
3061 GenTreeArgList* argList = op1->AsArgList();
3062 GenTreeArgList* originalArgList = argList;
3063
3064 op1 = argList->Current();
3065 argList = argList->Rest();
3066
3067 op2 = argList->Current();
3068 argList = argList->Rest();
3069
3070 op3 = argList->Current();
3071 assert(argList->Rest() == nullptr);
3072
3073 switch (category)
3074 {
3075 case HW_Category_SimpleSIMD:
3076 case HW_Category_SIMDScalar:
3077 case HW_Category_Scalar:
3078 {
3079 if ((intrinsicId >= NI_FMA_MultiplyAdd) && (intrinsicId <= NI_FMA_MultiplySubtractNegatedScalar))
3080 {
3081 bool supportsRegOptional = false;
3082
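                        // (Illustrative: "vfmadd213ps xmm1, xmm2, xmm3/m128" computes
                        // xmm1 = (xmm2 * xmm1) + xmm3, so containing op3 selects the 213 form;
                        // the 132 and 231 forms similarly let op2 or op1 come from memory.)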
3083 if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional))
3084 {
3085 // 213 form: op1 = (op2 * op1) + [op3]
3086 MakeSrcContained(node, op3);
3087 }
3088 else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3089 {
3090 // 132 form: op1 = (op1 * op3) + [op2]
3091 MakeSrcContained(node, op2);
3092 }
3093 else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3094 {
3095 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
3096
3097 if (!HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
3098 {
3099 // 231 form: op3 = (op2 * op3) + [op1]
3100 MakeSrcContained(node, op1);
3101 }
3102 }
3103 else
3104 {
3105 assert(supportsRegOptional);
3106
                            // TODO-XArch-CQ: Technically any one of the three operands can
                            //                be reg-optional, with the limitation that op1
                            //                can only be so if CopyUpperBits is off.
3110 // https://github.com/dotnet/coreclr/issues/6361
3111
3112 // 213 form: op1 = (op2 * op1) + op3
3113 op3->SetRegOptional();
3114 }
3115 }
3116 else
3117 {
3118 bool supportsRegOptional = false;
3119
3120 switch (intrinsicId)
3121 {
3122 case NI_SSE41_BlendVariable:
3123 case NI_AVX_BlendVariable:
3124 case NI_AVX2_BlendVariable:
3125 {
3126 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3127 {
3128 MakeSrcContained(node, op2);
3129 }
3130 else if (supportsRegOptional)
3131 {
3132 op2->SetRegOptional();
3133 }
3134 break;
3135 }
3136
3137 case NI_BMI2_MultiplyNoFlags:
3138 case NI_BMI2_X64_MultiplyNoFlags:
3139 {
3140 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3141 {
3142 MakeSrcContained(node, op2);
3143 }
3144 else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3145 {
3146 MakeSrcContained(node, op1);
3147 // MultiplyNoFlags is a Commutative operation, so swap the first two operands here
3148 // to make the containment checks in codegen significantly simpler
3149 *(originalArgList->pCurrent()) = op2;
3150 *(originalArgList->Rest()->pCurrent()) = op1;
3151 }
3152 else if (supportsRegOptional)
3153 {
3154 op2->SetRegOptional();
3155 }
3156 break;
3157 }
3158
3159 default:
3160 {
3161 unreached();
3162 break;
3163 }
3164 }
                    }
                    break;
                }
3167
3168 case HW_Category_IMM:
3169 {
3170 bool supportsRegOptional = false;
3171
3172 switch (intrinsicId)
3173 {
3174 case NI_SSE_Shuffle:
3175 case NI_SSE2_Insert:
3176 case NI_SSE2_Shuffle:
3177 case NI_SSSE3_AlignRight:
3178 case NI_SSE41_Blend:
3179 case NI_SSE41_DotProduct:
3180 case NI_SSE41_Insert:
3181 case NI_SSE41_X64_Insert:
3182 case NI_SSE41_MultipleSumAbsoluteDifferences:
3183 case NI_AVX_Blend:
3184 case NI_AVX_Compare:
3185 case NI_AVX_CompareScalar:
3186 case NI_AVX_DotProduct:
3187 case NI_AVX_Permute2x128:
3188 case NI_AVX_Shuffle:
3189 case NI_AVX2_Blend:
3190 case NI_AVX2_MultipleSumAbsoluteDifferences:
3191 case NI_AVX2_Permute2x128:
3192 case NI_PCLMULQDQ_CarrylessMultiply:
3193 {
3194 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3195 {
3196 MakeSrcContained(node, op2);
3197 }
3198 else if (supportsRegOptional)
3199 {
3200 op2->SetRegOptional();
3201 }
3202 break;
3203 }
3204
3205 default:
3206 {
3207 break;
3208 }
3209 }
3210
3211 break;
3212 }
3213
3214 default:
3215 {
3216 unreached();
3217 break;
3218 }
3219 }
3220 }
3221 else
3222 {
3223 unreached();
3224 }
3225
3226 if (HWIntrinsicInfo::lookupCategory(intrinsicId) == HW_Category_IMM)
3227 {
3228 GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
3229 assert(lastOp != nullptr);
3230
3231 if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI())
3232 {
3233 MakeSrcContained(node, lastOp);
3234 }
3235 }
3236 }
3237}
3238#endif // FEATURE_HW_INTRINSICS
3239
3240//------------------------------------------------------------------------
3241// ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained.
3242//
3243// Arguments:
3244// node - pointer to the node
3245//
3246void Lowering::ContainCheckFloatBinary(GenTreeOp* node)
3247{
3248 assert(node->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV) && varTypeIsFloating(node));
3249
3250 // overflow operations aren't supported on float/double types.
3251 assert(!node->gtOverflowEx());
3252
3253 GenTree* op1 = node->gtGetOp1();
3254 GenTree* op2 = node->gtGetOp2();
3255
3256 // No implicit conversions at this stage as the expectation is that
3257 // everything is made explicit by adding casts.
3258 assert(op1->TypeGet() == op2->TypeGet());
3259
3260 bool isSafeToContainOp1 = true;
3261 bool isSafeToContainOp2 = true;
3262
3263 if (op2->IsCnsNonZeroFltOrDbl())
3264 {
3265 MakeSrcContained(node, op2);
3266 }
3267 else if (IsContainableMemoryOp(op2))
3268 {
3269 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
3270 if (isSafeToContainOp2)
3271 {
3272 MakeSrcContained(node, op2);
3273 }
3274 }
3275
3276 if (!op2->isContained() && node->OperIsCommutative())
3277 {
        // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
        // as long as it is safe, so that the following efficient code sequence is generated:
        //      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
        //      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
        //
        // Instead of
        //      movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR
        //      movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, Op2Reg
3286
3287 if (op1->IsCnsNonZeroFltOrDbl())
3288 {
3289 MakeSrcContained(node, op1);
3290 }
3291 else if (IsContainableMemoryOp(op1))
3292 {
3293 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
3294 if (isSafeToContainOp1)
3295 {
3296 MakeSrcContained(node, op1);
3297 }
3298 }
3299 }
3300
3301 if (!op1->isContained() && !op2->isContained())
3302 {
3303 // If there are no containable operands, we can make an operand reg optional.
3304 // IsSafeToContainMem is expensive so we call it at most once for each operand
3305 // in this method. If we already called IsSafeToContainMem, it must have returned false;
3306 // otherwise, the corresponding operand (op1 or op2) would be contained.
3307 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
3308 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
3309 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
3310 }
3311}
3312
3313#endif // _TARGET_XARCH_
3314