1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
7 | XX XX |
8 | XX Lowering for AMD64, x86 XX |
9 | XX XX |
10 | XX This encapsulates all the logic for lowering trees for the AMD64 XX |
XX  architecture. For a more detailed view of what lowering is, please      XX
12 | XX take a look at Lower.cpp XX |
13 | XX XX |
14 | XX XX |
15 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
16 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
17 | */ |
18 | |
19 | #include "jitpch.h" |
20 | #ifdef _MSC_VER |
21 | #pragma hdrstop |
22 | #endif |
23 | |
24 | #ifdef _TARGET_XARCH_ // This file is only used for xarch |
25 | |
26 | #include "jit.h" |
27 | #include "sideeffects.h" |
28 | #include "lower.h" |
29 | |
30 | // xarch supports both ROL and ROR instructions so no lowering is required. |
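// For example (illustrative), a pattern such as "(x << 3) | (x >> 29)" that earlier
// phases have already recognized as GT_ROL can be emitted directly as "rol reg, 3".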
31 | void Lowering::LowerRotate(GenTree* tree) |
32 | { |
33 | ContainCheckShiftRotate(tree->AsOp()); |
34 | } |
35 | |
36 | //------------------------------------------------------------------------ |
37 | // LowerStoreLoc: Lower a store of a lclVar |
38 | // |
39 | // Arguments: |
40 | // storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR) |
41 | // |
42 | // Notes: |
43 | // This involves: |
44 | // - Handling of contained immediates. |
45 | // - Widening operations of unsigneds. |
46 | |
47 | void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) |
48 | { |
49 | // Try to widen the ops if they are going into a local var. |
50 | if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT)) |
51 | { |
52 | GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon(); |
53 | ssize_t ival = con->gtIconVal; |
54 | |
55 | unsigned varNum = storeLoc->gtLclNum; |
56 | LclVarDsc* varDsc = comp->lvaTable + varNum; |
57 | |
58 | if (varDsc->lvIsSIMDType()) |
59 | { |
60 | noway_assert(storeLoc->gtType != TYP_STRUCT); |
61 | } |
62 | unsigned size = genTypeSize(storeLoc); |
63 | // If we are storing a constant into a local variable |
64 | // we extend the size of the store here |
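        // For example (a sketch): storing the constant 0xC0 into a signed byte local
        // becomes a 4-byte TYP_INT store of 0xFFFFFFC0 (the sign-extended value),
        // since the stack slot is at least 4 bytes wide.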
65 | if ((size < 4) && !varTypeIsStruct(varDsc)) |
66 | { |
67 | if (!varTypeIsUnsigned(varDsc)) |
68 | { |
69 | if (genTypeSize(storeLoc) == 1) |
70 | { |
71 | if ((ival & 0x7f) != ival) |
72 | { |
73 | ival = ival | 0xffffff00; |
74 | } |
75 | } |
76 | else |
77 | { |
78 | assert(genTypeSize(storeLoc) == 2); |
79 | if ((ival & 0x7fff) != ival) |
80 | { |
81 | ival = ival | 0xffff0000; |
82 | } |
83 | } |
84 | } |
85 | |
86 | // A local stack slot is at least 4 bytes in size, regardless of |
87 | // what the local var is typed as, so auto-promote it here |
88 | // unless it is a field of a promoted struct |
89 | // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this? |
90 | if (!varDsc->lvIsStructField) |
91 | { |
92 | storeLoc->gtType = TYP_INT; |
93 | con->SetIconValue(ival); |
94 | } |
95 | } |
96 | } |
97 | if (storeLoc->OperIs(GT_STORE_LCL_FLD)) |
98 | { |
99 | // We should only encounter this for lclVars that are lvDoNotEnregister. |
100 | verifyLclFldDoNotEnregister(storeLoc->gtLclNum); |
101 | } |
102 | ContainCheckStoreLoc(storeLoc); |
103 | } |
104 | |
105 | //------------------------------------------------------------------------ |
106 | // LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained. |
107 | // |
108 | // Arguments: |
109 | // node - The indirect store node (GT_STORE_IND) of interest |
110 | // |
111 | // Return Value: |
112 | // None. |
113 | // |
114 | void Lowering::LowerStoreIndir(GenTreeIndir* node) |
115 | { |
    // Mark the GT_STOREIND node to indicate that it is not yet known
    // whether it represents a RMW memory op.
118 | node->AsStoreInd()->SetRMWStatusDefault(); |
119 | |
120 | if (!varTypeIsFloating(node)) |
121 | { |
122 | // Perform recognition of trees with the following structure: |
123 | // StoreInd(addr, BinOp(expr, GT_IND(addr))) |
124 | // to be able to fold this into an instruction of the form |
125 | // BINOP [addr], register |
126 | // where register is the actual place where 'expr' is computed. |
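        // For example (illustrative), StoreInd(addr, Add(Ind(addr), 5)) can then be
        // encoded as "add dword ptr [addr], 5" rather than a separate load, add, and store.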
127 | // |
        // SSE2 doesn't support the RMW form of instructions.
129 | if (LowerRMWMemOp(node)) |
130 | { |
131 | return; |
132 | } |
133 | } |
134 | ContainCheckStoreIndir(node); |
135 | } |
136 | |
137 | //------------------------------------------------------------------------ |
138 | // LowerBlockStore: Set block store type |
139 | // |
140 | // Arguments: |
141 | // blkNode - The block store node of interest |
142 | // |
143 | // Return Value: |
144 | // None. |
145 | // |
146 | void Lowering::LowerBlockStore(GenTreeBlk* blkNode) |
147 | { |
148 | GenTree* dstAddr = blkNode->Addr(); |
149 | unsigned size = blkNode->gtBlkSize; |
150 | GenTree* source = blkNode->Data(); |
151 | Compiler* compiler = comp; |
152 | GenTree* srcAddrOrFill = nullptr; |
153 | bool isInitBlk = blkNode->OperIsInitBlkOp(); |
154 | |
155 | if (!isInitBlk) |
156 | { |
157 | // CopyObj or CopyBlk |
158 | if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe)) |
159 | { |
160 | blkNode->SetOper(GT_STORE_BLK); |
161 | } |
162 | if (source->gtOper == GT_IND) |
163 | { |
164 | srcAddrOrFill = blkNode->Data()->gtGetOp1(); |
165 | } |
166 | } |
167 | |
168 | if (isInitBlk) |
169 | { |
170 | GenTree* initVal = source; |
171 | if (initVal->OperIsInitVal()) |
172 | { |
173 | initVal->SetContained(); |
174 | initVal = initVal->gtGetOp1(); |
175 | } |
176 | srcAddrOrFill = initVal; |
        // If we have an InitBlk with a constant block size we can optimize several ways:
        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes,
        //    we use rep stosb since this reduces the register pressure in LSRA and we have
        //    roughly the same performance as calling the helper.
        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
        //    we can speed this up by unrolling the loop using SSE2 stores. The reason for
        //    this threshold is that, per our last investigation (Fall 2013), more than 95% of the
        //    initblks in our framework assemblies were <= INITBLK_UNROLL_LIMIT bytes in size, so
        //    this is the preferred code sequence for the vast majority of cases.
186 | |
        // This threshold decides between using the helper and letting the JIT inline
        // a code sequence of its choice.
189 | unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT); |
190 | |
191 | // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86 |
192 | if (size != 0 && size <= helperThreshold) |
193 | { |
194 | // Always favor unrolling vs rep stos. |
195 | if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI()) |
196 | { |
                // The fill value of an initblk is interpreted as an unsigned int8;
                // however, a constant of any size may in practice reside on the
                // evaluation stack. So extract the lower byte of the initVal
                // constant and replicate it into a larger constant whose size is
                // sufficient to support the largest width store of the desired
                // inline expansion.
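                // For example (a sketch), a fill byte of 0x05 becomes the constant
                // 0x0505050505050505 on AMD64 (0x05050505 on x86), so each unrolled
                // store can cover a full register width.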
203 | |
204 | ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF; |
205 | #ifdef _TARGET_AMD64_ |
206 | if (size < REGSIZE_BYTES) |
207 | { |
208 | initVal->gtIntCon.gtIconVal = 0x01010101 * fill; |
209 | } |
210 | else |
211 | { |
212 | initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill; |
213 | initVal->gtType = TYP_LONG; |
214 | if ((fill == 0) && ((size & 0xf) == 0)) |
215 | { |
216 | MakeSrcContained(blkNode, source); |
217 | } |
218 | } |
219 | #else // !_TARGET_AMD64_ |
220 | initVal->gtIntCon.gtIconVal = 0x01010101 * fill; |
221 | #endif // !_TARGET_AMD64_ |
222 | |
223 | if ((fill == 0) && ((size & 0xf) == 0)) |
224 | { |
225 | MakeSrcContained(blkNode, source); |
226 | } |
227 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; |
228 | } |
229 | else |
230 | { |
231 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; |
232 | } |
233 | } |
234 | else |
235 | { |
236 | #ifdef _TARGET_AMD64_ |
237 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; |
238 | #else // !_TARGET_AMD64_ |
239 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; |
240 | #endif // !_TARGET_AMD64_ |
241 | } |
242 | } |
243 | else |
244 | { |
245 | if (blkNode->gtOper == GT_STORE_OBJ) |
246 | { |
247 | // CopyObj |
248 | |
249 | GenTreeObj* cpObjNode = blkNode->AsObj(); |
250 | |
251 | unsigned slots = cpObjNode->gtSlots; |
252 | |
253 | #ifdef DEBUG |
254 | // CpObj must always have at least one GC-Pointer as a member. |
255 | assert(cpObjNode->gtGcPtrCount > 0); |
256 | |
257 | assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL); |
258 | |
259 | CORINFO_CLASS_HANDLE clsHnd = cpObjNode->gtClass; |
260 | size_t classSize = comp->info.compCompHnd->getClassSize(clsHnd); |
261 | size_t blkSize = roundUp(classSize, TARGET_POINTER_SIZE); |
262 | |
            // Currently, the EE always rounds up the size of a class data structure, so
            // we do not handle the case of a struct whose size is not a multiple of the
            // pointer size. This behavior may change in the future, so to keep things
            // correct let's assert it just to be safe. Going forward we should simply
            // handle this case.
268 | assert(classSize == blkSize); |
269 | assert((blkSize / TARGET_POINTER_SIZE) == slots); |
270 | assert(cpObjNode->HasGCPtr()); |
271 | #endif |
272 | |
273 | bool IsRepMovsProfitable = false; |
274 | |
275 | // If the destination is not on the stack, let's find out if we |
276 | // can improve code size by using rep movsq instead of generating |
277 | // sequences of movsq instructions. |
278 | if (!dstAddr->OperIsLocalAddr()) |
279 | { |
280 | // Let's inspect the struct/class layout and determine if it's profitable |
281 | // to use rep movsq for copying non-gc memory instead of using single movsq |
282 | // instructions for each memory slot. |
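                // For example (a sketch), a 64-bit layout of { ref, long, long, long, long, ref }
                // has a run of 4 contiguous non-gc slots; if that run reaches
                // CPOBJ_NONGC_SLOTS_LIMIT, rep movsq is considered profitable for it.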
283 | unsigned i = 0; |
284 | BYTE* gcPtrs = cpObjNode->gtGcPtrs; |
285 | |
286 | do |
287 | { |
288 | unsigned nonGCSlots = 0; |
289 | // Measure a contiguous non-gc area inside the struct and note the maximum. |
290 | while (i < slots && gcPtrs[i] == TYPE_GC_NONE) |
291 | { |
292 | nonGCSlots++; |
293 | i++; |
294 | } |
295 | |
296 | while (i < slots && gcPtrs[i] != TYPE_GC_NONE) |
297 | { |
298 | i++; |
299 | } |
300 | |
301 | if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT) |
302 | { |
303 | IsRepMovsProfitable = true; |
304 | break; |
305 | } |
306 | } while (i < slots); |
307 | } |
308 | else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT) |
309 | { |
310 | IsRepMovsProfitable = true; |
311 | } |
312 | |
            // There are two cases in which we need to materialize the
            // struct size:
            // a) When the destination is on the stack we don't need to use the
            //    write barrier, we can just simply call rep movsq and get a win in codesize.
            // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
            //    to use rep movsq instead of a sequence of single movsq instructions. According to the
            //    Intel Manual, the sweet spot for small structs is between 4 and 12 slots in size, where
            //    the entire operation takes 20 cycles and encodes in 5 bytes (loading RCX and issuing
            //    rep movsq).
321 | if (IsRepMovsProfitable) |
322 | { |
323 | // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq. |
324 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; |
325 | } |
326 | else |
327 | { |
328 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; |
329 | } |
330 | } |
331 | else |
332 | { |
333 | assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK)); |
334 | // CopyBlk |
            // For a CpBlk with a constant size smaller than CPBLK_MOVS_LIMIT
            // we can use rep movs to generate code instead of the helper call.
337 | |
            // This threshold decides between using the helper and letting the JIT inline
            // a code sequence of its choice.
340 | unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); |
341 | |
342 | // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86 |
343 | if ((size != 0) && (size <= helperThreshold)) |
344 | { |
345 | // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. |
                // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes account for more
                // than 95% of the cases in our framework assemblies, so this is the main code
                // generation scheme we'll use.
348 | if (size <= CPBLK_UNROLL_LIMIT) |
349 | { |
350 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; |
351 | |
352 | // If src or dst are on stack, we don't have to generate the address |
353 | // into a register because it's just some constant+SP. |
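                    // For example (illustrative), a local's address is just "rsp/rbp + constant",
                    // which folds directly into the addressing modes of the unrolled copy sequence.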
354 | if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr()) |
355 | { |
356 | MakeSrcContained(blkNode, srcAddrOrFill); |
357 | } |
358 | |
359 | if (dstAddr->OperIsLocalAddr()) |
360 | { |
361 | MakeSrcContained(blkNode, dstAddr); |
362 | } |
363 | } |
364 | else |
365 | { |
366 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; |
367 | } |
368 | } |
369 | #ifdef _TARGET_AMD64_ |
370 | else |
371 | { |
372 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; |
373 | } |
374 | #elif defined(_TARGET_X86_) |
375 | else |
376 | { |
377 | blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; |
378 | } |
379 | #endif // _TARGET_X86_ |
380 | assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid); |
381 | } |
382 | |
383 | // CopyObj or CopyBlk |
384 | if (source->gtOper == GT_IND) |
385 | { |
386 | // The GT_IND is contained, but the address must be in a register unless it is local. |
387 | MakeSrcContained(blkNode, source); |
388 | GenTree* addr = source->AsIndir()->Addr(); |
389 | if (!addr->OperIsLocalAddr()) |
390 | { |
391 | addr->ClearContained(); |
392 | } |
393 | } |
394 | else if (!source->IsMultiRegCall() && !source->OperIsSIMD() && !source->OperIsSimdHWIntrinsic()) |
395 | { |
396 | assert(source->IsLocal()); |
397 | MakeSrcContained(blkNode, source); |
398 | } |
399 | } |
400 | } |
401 | |
402 | //------------------------------------------------------------------------ |
403 | // LowerPutArgStk: Lower a GT_PUTARG_STK. |
404 | // |
405 | // Arguments: |
406 | // tree - The node of interest |
407 | // |
408 | // Return Value: |
409 | // None. |
410 | // |
411 | void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) |
412 | { |
413 | #ifdef _TARGET_X86_ |
414 | if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST) |
415 | { |
416 | putArgStk->gtNumberReferenceSlots = 0; |
417 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid; |
418 | |
419 | GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList(); |
420 | |
421 | // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order |
422 | // of uses is visible to LSRA. |
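        // For example (a sketch), fields at offsets {0, 4, 8} end up ordered {8, 4, 0},
        // so the generated pushes walk from the highest offset down to offset 0.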
423 | unsigned fieldCount = 0; |
424 | GenTreeFieldList* head = nullptr; |
425 | for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next) |
426 | { |
427 | next = current->Rest(); |
428 | |
429 | // First, insert the field node into the sorted list. |
430 | GenTreeFieldList* prev = nullptr; |
431 | for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest()) |
432 | { |
433 | // If the offset of the current list node is greater than the offset of the cursor or if we have |
434 | // reached the end of the list, insert the current node before the cursor and terminate. |
435 | if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset)) |
436 | { |
437 | if (prev == nullptr) |
438 | { |
439 | assert(cursor == head); |
440 | head = current; |
441 | } |
442 | else |
443 | { |
444 | prev->Rest() = current; |
445 | } |
446 | |
447 | current->Rest() = cursor; |
448 | break; |
449 | } |
450 | } |
451 | |
452 | fieldCount++; |
453 | } |
454 | |
455 | // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the |
456 | // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct |
457 | // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the |
458 | // corresponding field list nodes in two, giving an upper bound of 8. |
459 | // |
460 | // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if |
461 | // the maximum size of a field list grows significantly, we will need to reevaluate it. |
462 | assert(fieldCount <= 8); |
463 | |
464 | // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if |
465 | // necessary. |
466 | if (head != fieldList) |
467 | { |
468 | head->gtFlags |= GTF_FIELD_LIST_HEAD; |
469 | head->SetContained(); |
470 | |
471 | fieldList->ClearContained(); |
472 | fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD; |
473 | |
474 | #ifdef DEBUG |
475 | head->gtSeqNum = fieldList->gtSeqNum; |
476 | #endif // DEBUG |
477 | |
478 | BlockRange().InsertAfter(fieldList, head); |
479 | BlockRange().Remove(fieldList); |
480 | |
481 | fieldList = head; |
482 | putArgStk->gtOp1 = fieldList; |
483 | putArgStk->gtType = fieldList->gtType; |
484 | } |
485 | |
        // Now that the fields have been sorted, determine the kind of code we will generate.
487 | bool allFieldsAreSlots = true; |
488 | unsigned prevOffset = putArgStk->getArgSize(); |
489 | for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest()) |
490 | { |
491 | GenTree* const fieldNode = current->Current(); |
492 | const var_types fieldType = fieldNode->TypeGet(); |
493 | const unsigned fieldOffset = current->gtFieldOffset; |
494 | assert(fieldType != TYP_LONG); |
495 | |
496 | // We can treat as a slot any field that is stored at a slot boundary, where the previous |
497 | // field is not in the same slot. (Note that we store the fields in reverse order.) |
498 | const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); |
499 | if (!fieldIsSlot) |
500 | { |
501 | allFieldsAreSlots = false; |
502 | } |
503 | |
504 | if (varTypeIsGC(fieldType)) |
505 | { |
506 | putArgStk->gtNumberReferenceSlots++; |
507 | } |
508 | |
509 | // For x86 we must mark all integral fields as contained or reg-optional, and handle them |
510 | // accordingly in code generation, since we may have up to 8 fields, which cannot all be in |
511 | // registers to be consumed atomically by the call. |
512 | if (varTypeIsIntegralOrI(fieldNode)) |
513 | { |
514 | if (fieldNode->OperGet() == GT_LCL_VAR) |
515 | { |
516 | LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]); |
517 | if (!varDsc->lvDoNotEnregister) |
518 | { |
519 | fieldNode->SetRegOptional(); |
520 | } |
521 | else |
522 | { |
523 | MakeSrcContained(putArgStk, fieldNode); |
524 | } |
525 | } |
526 | else if (fieldNode->IsIntCnsFitsInI32()) |
527 | { |
528 | MakeSrcContained(putArgStk, fieldNode); |
529 | } |
530 | else |
531 | { |
532 | // For the case where we cannot directly push the value, if we run out of registers, |
533 | // it would be better to defer computation until we are pushing the arguments rather |
534 | // than spilling, but this situation is not all that common, as most cases of promoted |
535 | // structs do not have a large number of fields, and of those most are lclVars or |
536 | // copy-propagated constants. |
537 | fieldNode->SetRegOptional(); |
538 | } |
539 | } |
540 | |
541 | prevOffset = fieldOffset; |
542 | } |
543 | |
544 | // Set the copy kind. |
545 | // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should |
546 | // adjust the stack once for those fields. The latter is really best done in code generation, but |
547 | // this tuning should probably be undertaken as a whole. |
548 | // Also, if there are floating point fields, it may be better to use the "Unroll" mode |
549 | // of copying the struct as a whole, if the fields are not register candidates. |
550 | if (allFieldsAreSlots) |
551 | { |
552 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots; |
553 | } |
554 | else |
555 | { |
556 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; |
557 | } |
558 | return; |
559 | } |
560 | #endif // _TARGET_X86_ |
561 | |
562 | GenTree* src = putArgStk->gtOp1; |
563 | |
564 | #ifdef FEATURE_PUT_STRUCT_ARG_STK |
565 | if (src->TypeGet() != TYP_STRUCT) |
566 | #endif // FEATURE_PUT_STRUCT_ARG_STK |
567 | { |
568 | // If the child of GT_PUTARG_STK is a constant, we don't need a register to |
569 | // move it to memory (stack location). |
570 | // |
571 | // On AMD64, we don't want to make 0 contained, because we can generate smaller code |
572 | // by zeroing a register and then storing it. E.g.: |
573 | // xor rdx, rdx |
574 | // mov gword ptr [rsp+28H], rdx |
575 | // is 2 bytes smaller than: |
576 | // mov gword ptr [rsp+28H], 0 |
577 | // |
578 | // On x86, we push stack arguments; we don't use 'mov'. So: |
579 | // push 0 |
580 | // is 1 byte smaller than: |
581 | // xor rdx, rdx |
582 | // push rdx |
583 | |
584 | if (IsContainableImmed(putArgStk, src) |
585 | #if defined(_TARGET_AMD64_) |
586 | && !src->IsIntegralConst(0) |
587 | #endif // _TARGET_AMD64_ |
588 | ) |
589 | { |
590 | MakeSrcContained(putArgStk, src); |
591 | } |
592 | return; |
593 | } |
594 | |
595 | #ifdef FEATURE_PUT_STRUCT_ARG_STK |
596 | GenTree* dst = putArgStk; |
597 | GenTree* srcAddr = nullptr; |
598 | |
599 | bool haveLocalAddr = false; |
600 | if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) |
601 | { |
602 | srcAddr = src->gtOp.gtOp1; |
603 | assert(srcAddr != nullptr); |
604 | haveLocalAddr = srcAddr->OperIsLocalAddr(); |
605 | } |
606 | else |
607 | { |
608 | assert(varTypeIsSIMD(putArgStk)); |
609 | } |
610 | |
    // For a CpBlk we could use a helper call. For a putarg_stk we
    // can't do that since the helper call could kill some already set up outgoing args.
613 | // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. |
614 | // The cpyXXXX code is rather complex and this could cause it to be more complex, but |
615 | // it might be the right thing to do. |
616 | |
    // This threshold decides between using the helper and letting the JIT inline
    // a code sequence of its choice.
619 | ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); |
620 | ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; |
621 | |
    // TODO-X86-CQ: The helper call either is not supported on x86 or requires more work
    // (I don't know which).
624 | |
625 | // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. |
    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes account for more than 95%
    // of the cases in our framework assemblies, so this is the main code generation scheme we'll use.
628 | if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0) |
629 | { |
630 | #ifdef _TARGET_X86_ |
631 | if (size < XMM_REGSIZE_BYTES) |
632 | { |
633 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; |
634 | } |
635 | else |
636 | #endif // _TARGET_X86_ |
637 | { |
638 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; |
639 | } |
640 | } |
641 | #ifdef _TARGET_X86_ |
642 | else if (putArgStk->gtNumberReferenceSlots != 0) |
643 | { |
644 | // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update |
645 | // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions. |
646 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; |
647 | } |
648 | #endif // _TARGET_X86_ |
649 | else |
650 | { |
651 | putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; |
652 | } |
653 | // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. |
654 | MakeSrcContained(putArgStk, src); |
655 | if (haveLocalAddr) |
656 | { |
657 | // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary |
658 | // copies. |
659 | // |
660 | MakeSrcContained(putArgStk, srcAddr); |
661 | } |
662 | #endif // FEATURE_PUT_STRUCT_ARG_STK |
663 | } |
664 | |
665 | /* Lower GT_CAST(srcType, DstType) nodes. |
666 | * |
667 | * Casts from small int type to float/double are transformed as follows: |
668 | * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double) |
669 | * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double) |
670 | * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double) |
671 | * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double) |
672 | * |
 * SSE2 conversion instructions operate on signed integers. Casts from uint32/uint64
 * are morphed by the front-end as follows and hence should not be seen here:
675 | * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double) |
676 | * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float) |
677 | * |
678 | * |
679 | * Similarly casts from float/double to a smaller int type are transformed as follows: |
680 | * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) |
681 | * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) |
 * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(float/double, int32), int16)
 * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(float/double, int32), uint16)
684 | * |
 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
 * integer. The above transformations help us to leverage those instructions.
687 | * |
688 | * Note that for the following conversions we still depend on helper calls and |
689 | * don't expect to see them here. |
690 | * i) GT_CAST(float/double, uint64) |
691 | * ii) GT_CAST(float/double, int type with overflow detection) |
692 | * |
 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
 * There are hardly any occurrences of this conversion operation in platform
 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib and microsoft.jscript,
 * 1 occurrence in Roslyn, and no occurrences in system, system.core, system.numerics,
 * system.windows.forms, scimark, fractals, or bio mums). If we ever find evidence that
 * doing this optimization is a win, we should consider generating in-lined code.
699 | */ |
700 | void Lowering::LowerCast(GenTree* tree) |
701 | { |
702 | assert(tree->OperGet() == GT_CAST); |
703 | |
704 | GenTree* castOp = tree->gtCast.CastOp(); |
705 | var_types castToType = tree->CastToType(); |
706 | var_types srcType = castOp->TypeGet(); |
707 | var_types tmpType = TYP_UNDEF; |
708 | |
    // Force the srcType to unsigned if the GTF_UNSIGNED flag is set.
710 | if (tree->gtFlags & GTF_UNSIGNED) |
711 | { |
712 | srcType = genUnsignedType(srcType); |
713 | } |
714 | |
    // We should never see the following casts as they are expected to be lowered
    // appropriately or converted into helper calls by the front-end.
717 | // srcType = float/double castToType = * and overflow detecting cast |
718 | // Reason: must be converted to a helper call |
719 | // srcType = float/double, castToType = ulong |
720 | // Reason: must be converted to a helper call |
721 | // srcType = uint castToType = float/double |
722 | // Reason: uint -> float/double = uint -> long -> float/double |
723 | // srcType = ulong castToType = float |
724 | // Reason: ulong -> float = ulong -> double -> float |
725 | if (varTypeIsFloating(srcType)) |
726 | { |
727 | noway_assert(!tree->gtOverflow()); |
728 | noway_assert(castToType != TYP_ULONG); |
729 | } |
730 | else if (srcType == TYP_UINT) |
731 | { |
732 | noway_assert(!varTypeIsFloating(castToType)); |
733 | } |
734 | else if (srcType == TYP_ULONG) |
735 | { |
736 | noway_assert(castToType != TYP_FLOAT); |
737 | } |
738 | |
    // Case 1: src is a small type and dst is a floating point type.
740 | if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType)) |
741 | { |
742 | // These conversions can never be overflow detecting ones. |
743 | noway_assert(!tree->gtOverflow()); |
744 | tmpType = TYP_INT; |
745 | } |
    // Case 2: src is a floating point type and dst is a small type.
747 | else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType)) |
748 | { |
749 | tmpType = TYP_INT; |
750 | } |
751 | |
752 | if (tmpType != TYP_UNDEF) |
753 | { |
754 | GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType); |
755 | tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); |
756 | |
757 | tree->gtFlags &= ~GTF_UNSIGNED; |
758 | tree->gtOp.gtOp1 = tmp; |
759 | BlockRange().InsertAfter(castOp, tmp); |
760 | ContainCheckCast(tmp->AsCast()); |
761 | } |
762 | |
763 | // Now determine if we have operands that should be contained. |
764 | ContainCheckCast(tree->AsCast()); |
765 | } |
766 | |
767 | #ifdef FEATURE_SIMD |
768 | //---------------------------------------------------------------------------------------------- |
769 | // Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node. |
770 | // |
771 | // Arguments: |
772 | // simdNode - The SIMD intrinsic node. |
773 | // |
774 | void Lowering::LowerSIMD(GenTreeSIMD* simdNode) |
775 | { |
776 | if (simdNode->TypeGet() == TYP_SIMD12) |
777 | { |
        // A GT_SIMD node that is required to produce a TYP_SIMD12 value in fact
        // produces a TYP_SIMD16 result
780 | simdNode->gtType = TYP_SIMD16; |
781 | } |
782 | |
783 | #ifdef _TARGET_XARCH_ |
784 | if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND)) |
785 | { |
        // If the SIMD vector is already in memory, we force its
        // address to be evaluated into a reg. This allows
        // us to generate [regBase], [regBase+offset], or
        // [regBase+sizeOf(SIMD vector baseType)*regIndex]
        // to access the required SIMD vector element directly
        // from memory.
792 | // |
        // TODO-CQ-XARCH: If the addr of the GT_IND is a GT_LEA, we
        // might be able to update the GT_LEA to fold the regIndex
        // or offset in some cases. Instead, with this
        // approach we always evaluate the GT_LEA into a reg.
797 | // Ideally, we should be able to lower GetItem intrinsic |
798 | // into GT_IND(newAddr) where newAddr combines |
799 | // the addr of SIMD vector with the given index. |
800 | simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; |
801 | } |
802 | else if (simdNode->IsSIMDEqualityOrInequality()) |
803 | { |
804 | LIR::Use simdUse; |
805 | |
806 | if (BlockRange().TryGetUse(simdNode, &simdUse)) |
807 | { |
808 | // |
809 | // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into |
810 | // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y) |
811 | // is expected to set the Zero flag appropriately. |
            // All the involved nodes must form a contiguous range; there's no other way to
            // guarantee that condition flags aren't changed between the SIMD node and the JCC
            // node.
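            // For example (roughly, and ISA-dependent), a vector equality may be emitted
            // as "pcmpeqd/pmovmskb/cmp" (or "ptest" where available), and after this
            // transform the JCC consumes those flags directly instead of a
            // materialized 0/1 value.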
815 | // |
816 | |
817 | bool transformed = false; |
818 | GenTree* simdUser = simdUse.User(); |
819 | |
820 | if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() && |
821 | (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser)) |
822 | { |
823 | ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue(); |
824 | |
825 | if ((relopOp2Value == 0) || (relopOp2Value == 1)) |
826 | { |
827 | GenTree* jtrue = simdUser->gtNext; |
828 | |
829 | if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser)) |
830 | { |
831 | if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ)) |
832 | { |
833 | relopOp2Value ^= 1; |
834 | } |
835 | |
836 | jtrue->ChangeOper(GT_JCC); |
837 | GenTreeCC* jcc = jtrue->AsCC(); |
838 | jcc->gtFlags |= GTF_USE_FLAGS; |
839 | jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ; |
840 | |
841 | BlockRange().Remove(simdUser->gtGetOp2()); |
842 | BlockRange().Remove(simdUser); |
843 | transformed = true; |
844 | } |
845 | } |
846 | } |
847 | |
848 | if (!transformed) |
849 | { |
850 | // |
            // The code generated for SIMD<OpEquality|OpInEquality>(x, y) nodes sets
            // the Zero flag like integer compares do, so we can simply use SETCC<EQ|NE>
853 | // to produce the desired result. This avoids the need for subsequent phases |
854 | // to have to handle 2 cases (set flags/set destination register). |
855 | // |
856 | |
857 | genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GT_EQ : GT_NE; |
858 | GenTreeCC* setcc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet()); |
859 | setcc->gtFlags |= GTF_USE_FLAGS; |
860 | BlockRange().InsertAfter(simdNode, setcc); |
861 | simdUse.ReplaceWith(comp, setcc); |
862 | } |
863 | } |
864 | |
865 | simdNode->gtFlags |= GTF_SET_FLAGS; |
866 | simdNode->gtType = TYP_VOID; |
867 | } |
868 | #endif |
869 | ContainCheckSIMD(simdNode); |
870 | } |
871 | #endif // FEATURE_SIMD |
872 | |
873 | #ifdef FEATURE_HW_INTRINSICS |
874 | //---------------------------------------------------------------------------------------------- |
875 | // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. |
876 | // |
877 | // Arguments: |
878 | // node - The hardware intrinsic node. |
879 | // |
880 | void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) |
881 | { |
882 | ContainCheckHWIntrinsic(node); |
883 | } |
884 | #endif // FEATURE_HW_INTRINSICS |
885 | |
886 | //---------------------------------------------------------------------------------------------- |
887 | // Lowering::IsRMWIndirCandidate: |
888 | // Returns true if the given operand is a candidate indirection for a read-modify-write |
889 | // operator. |
890 | // |
891 | // Arguments: |
892 | // operand - The operand to consider. |
893 | // storeInd - The indirect store that roots the possible RMW operator. |
894 | // |
895 | bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd) |
896 | { |
897 | // If the operand isn't an indirection, it's trivially not a candidate. |
898 | if (operand->OperGet() != GT_IND) |
899 | { |
900 | return false; |
901 | } |
902 | |
903 | // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the |
904 | // indirection is not a candidate. |
905 | GenTree* srcAddr = operand->gtGetOp1(); |
906 | GenTree* dstAddr = storeInd->gtGetOp1(); |
907 | if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd)) |
908 | { |
909 | return false; |
910 | } |
911 | |
912 | // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a |
913 | // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the |
914 | // indirection's tree is visited and check the side effects at each point. |
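    // For example (illustrative), if a call or an interfering store appears between the
    // indirection and the storeIndir, containing the indirection would move its load past
    // that side effect, so such a candidate must be rejected.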
915 | |
916 | m_scratchSideEffects.Clear(); |
917 | |
918 | assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0); |
919 | operand->gtLIRFlags |= LIR::Flags::Mark; |
920 | |
921 | unsigned markCount = 1; |
922 | GenTree* node; |
923 | for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev) |
924 | { |
925 | assert(node != nullptr); |
926 | |
927 | if ((node->gtLIRFlags & LIR::Flags::Mark) == 0) |
928 | { |
929 | m_scratchSideEffects.AddNode(comp, node); |
930 | } |
931 | else |
932 | { |
933 | node->gtLIRFlags &= ~LIR::Flags::Mark; |
934 | markCount--; |
935 | |
936 | if (m_scratchSideEffects.InterferesWith(comp, node, false)) |
937 | { |
                // The indirection's tree contains some node that can't be moved to the storeIndir.
                // The indirection is not a candidate. Clear any leftover mark bits and return.
940 | for (; markCount > 0; node = node->gtPrev) |
941 | { |
942 | if ((node->gtLIRFlags & LIR::Flags::Mark) != 0) |
943 | { |
944 | node->gtLIRFlags &= ~LIR::Flags::Mark; |
945 | markCount--; |
946 | } |
947 | } |
948 | return false; |
949 | } |
950 | |
951 | node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult { |
952 | assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0); |
953 | nodeOperand->gtLIRFlags |= LIR::Flags::Mark; |
954 | markCount++; |
955 | return GenTree::VisitResult::Continue; |
956 | }); |
957 | } |
958 | } |
959 | |
960 | // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's |
961 | // destination address, and that it and the transitive closure of its operand can be safely contained by the |
962 | // storeIndir. This indirection is therefore a candidate for an RMW op. |
963 | return true; |
964 | } |
965 | |
966 | //---------------------------------------------------------------------------------------------- |
// Returns true if this binary op is part of a GT_STOREIND of the following form:
// storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
// storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA))) in the case of commutative bin-ops
970 | // |
971 | // The above form for storeInd represents a read-modify-write memory binary operation. |
972 | // |
973 | // Parameters |
974 | // tree - GentreePtr of binOp |
975 | // |
976 | // Return Value |
977 | // True if 'tree' is part of a RMW memory operation pattern |
978 | // |
979 | bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree) |
980 | { |
981 | // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops |
982 | assert(!varTypeIsFloating(tree)); |
983 | assert(GenTree::OperIsBinary(tree->OperGet())); |
984 | |
    // Cheap bail-out check before more expensive checks are performed.
    // The RMW memory op pattern requires one of the operands of the binOp to be a GT_IND.
987 | if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND) |
988 | { |
989 | return false; |
990 | } |
991 | |
992 | LIR::Use use; |
993 | if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree) |
994 | { |
995 | return false; |
996 | } |
997 | |
    // Since recognizing the RMW memory op pattern is relatively expensive, we
    // cache the result in the GT_STOREIND node so that it can be reused when
    // the GT_STOREIND itself is lowered.
1001 | GenTree* indirCandidate = nullptr; |
1002 | GenTree* indirOpSource = nullptr; |
1003 | return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource); |
1004 | } |
1005 | |
1006 | //---------------------------------------------------------------------------------------------- |
1007 | // This method recognizes the case where we have a treeNode with the following structure: |
1008 | // storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR |
1009 | // storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst)) in case of commutative operations OR |
1010 | // storeInd(IndirDst, unaryOp(gtInd(IndirDst)) in case of unary operations |
1011 | // |
1012 | // Terminology: |
1013 | // indirDst = memory write of an addr mode (i.e. storeind destination) |
1014 | // indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op) |
1015 | // indirCandidate = memory read i.e. a gtInd of an addr mode |
1016 | // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node) |
1017 | // |
1018 | // In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the |
1019 | // following form in case of integer operations: |
1020 | // binOp [addressing mode], RegIndirOpSource |
1021 | // binOp [addressing mode], immediateVal |
1022 | // where RegIndirOpSource is the register where indirOpSource was computed. |
1023 | // |
// Right now, we recognize a few cases:
1025 | // a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant |
1026 | // b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz. |
1027 | // c) unaryOp is either not/neg |
1028 | // |
1029 | // Implementation Note: The following routines need to be in sync for RMW memory op optimization |
1030 | // to be correct and functional. |
1031 | // IndirsAreEquivalent() |
1032 | // NodesAreEquivalentLeaves() |
1033 | // Codegen of GT_STOREIND and genCodeForShiftRMW() |
1034 | // emitInsRMW() |
1035 | // |
1036 | // TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering |
1037 | // package to perform more complex tree recognition. |
1038 | // |
1039 | // TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source) |
1040 | // |
1041 | // Parameters: |
1042 | // tree - GT_STOREIND node |
1043 | // outIndirCandidate - out param set to indirCandidate as described above |
1044 | // ouutIndirOpSource - out param set to indirOpSource as described above |
1045 | // |
1046 | // Return value |
1047 | // True if there is a RMW memory operation rooted at a GT_STOREIND tree |
1048 | // and out params indirCandidate and indirOpSource are set to non-null values. |
1049 | // Otherwise, returns false with indirCandidate and indirOpSource set to null. |
1050 | // Also updates flags of GT_STOREIND tree with its RMW status. |
1051 | // |
1052 | bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource) |
1053 | { |
1054 | assert(!varTypeIsFloating(tree)); |
1055 | assert(outIndirCandidate != nullptr); |
1056 | assert(outIndirOpSource != nullptr); |
1057 | |
1058 | *outIndirCandidate = nullptr; |
1059 | *outIndirOpSource = nullptr; |
1060 | |
1061 | // Early out if storeInd is already known to be a non-RMW memory op |
1062 | GenTreeStoreInd* storeInd = tree->AsStoreInd(); |
1063 | if (storeInd->IsNonRMWMemoryOp()) |
1064 | { |
1065 | return false; |
1066 | } |
1067 | |
1068 | GenTree* indirDst = storeInd->gtGetOp1(); |
1069 | GenTree* indirSrc = storeInd->gtGetOp2(); |
1070 | genTreeOps oper = indirSrc->OperGet(); |
1071 | |
1072 | // Early out if it is already known to be a RMW memory op |
1073 | if (storeInd->IsRMWMemoryOp()) |
1074 | { |
1075 | if (GenTree::OperIsBinary(oper)) |
1076 | { |
1077 | if (storeInd->IsRMWDstOp1()) |
1078 | { |
1079 | *outIndirCandidate = indirSrc->gtGetOp1(); |
1080 | *outIndirOpSource = indirSrc->gtGetOp2(); |
1081 | } |
1082 | else |
1083 | { |
1084 | assert(storeInd->IsRMWDstOp2()); |
1085 | *outIndirCandidate = indirSrc->gtGetOp2(); |
1086 | *outIndirOpSource = indirSrc->gtGetOp1(); |
1087 | } |
1088 | assert(IndirsAreEquivalent(*outIndirCandidate, storeInd)); |
1089 | } |
1090 | else |
1091 | { |
1092 | assert(GenTree::OperIsUnary(oper)); |
1093 | assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd)); |
1094 | *outIndirCandidate = indirSrc->gtGetOp1(); |
1095 | *outIndirOpSource = indirSrc->gtGetOp1(); |
1096 | } |
1097 | |
1098 | return true; |
1099 | } |
1100 | |
    // If we reach here, we do not yet know the RMW status of the tree rooted at storeInd
1102 | assert(storeInd->IsRMWStatusUnknown()); |
1103 | |
1104 | // Early out if indirDst is not one of the supported memory operands. |
1105 | if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR && |
1106 | indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT) |
1107 | { |
1108 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); |
1109 | return false; |
1110 | } |
1111 | |
    // We cannot use Read-Modify-Write instruction forms with overflow checking instructions
    // because we are not allowed to modify the target until after the overflow check.
1114 | if (indirSrc->gtOverflowEx()) |
1115 | { |
1116 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); |
1117 | return false; |
1118 | } |
1119 | |
1120 | // At this point we can match one of two patterns: |
1121 | // |
1122 | // t_ind = indir t_addr_0 |
1123 | // ... |
1124 | // t_value = binop t_ind, t_other |
1125 | // ... |
1126 | // storeIndir t_addr_1, t_value |
1127 | // |
1128 | // or |
1129 | // |
1130 | // t_ind = indir t_addr_0 |
1131 | // ... |
1132 | // t_value = unop t_ind |
1133 | // ... |
1134 | // storeIndir t_addr_1, t_value |
1135 | // |
1136 | // In all cases, we will eventually make the binop that produces t_value and the entire dataflow tree rooted at |
1137 | // t_ind contained by t_value. |
1138 | |
1139 | GenTree* indirCandidate = nullptr; |
1140 | GenTree* indirOpSource = nullptr; |
1141 | RMWStatus status = STOREIND_RMW_STATUS_UNKNOWN; |
1142 | if (GenTree::OperIsBinary(oper)) |
1143 | { |
1144 | // Return if binary op is not one of the supported operations for RMW of memory. |
1145 | if (!GenTree::OperIsRMWMemOp(oper)) |
1146 | { |
1147 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); |
1148 | return false; |
1149 | } |
1150 | |
1151 | if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd)) |
1152 | { |
            // Per the ldind spec, integer values smaller than 4 bytes (e.g. a boolean or
            // a character) are converted to 4 bytes by sign- or zero-extension as appropriate.
            // If we directly shift the small-typed data in memory using sar, we lose those
            // extension bits.
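            // For example (a sketch of the hazard), for an unsigned 2-byte value 0x8000,
            // (0x00008000 >> 1) computed on the widened int yields 0x4000, whereas a
            // direct "sar word ptr [mem], 1" would treat the 2-byte sign bit as set
            // and yield 0xC000.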
1156 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE); |
1157 | return false; |
1158 | } |
1159 | |
1160 | // In the common case, the second operand to the binop will be the indir candidate. |
1161 | GenTreeOp* binOp = indirSrc->AsOp(); |
1162 | if (GenTree::OperIsCommutative(oper) && IsRMWIndirCandidate(binOp->gtOp2, storeInd)) |
1163 | { |
1164 | indirCandidate = binOp->gtOp2; |
1165 | indirOpSource = binOp->gtOp1; |
1166 | status = STOREIND_RMW_DST_IS_OP2; |
1167 | } |
1168 | else if (IsRMWIndirCandidate(binOp->gtOp1, storeInd)) |
1169 | { |
1170 | indirCandidate = binOp->gtOp1; |
1171 | indirOpSource = binOp->gtOp2; |
1172 | status = STOREIND_RMW_DST_IS_OP1; |
1173 | } |
1174 | else |
1175 | { |
1176 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); |
1177 | return false; |
1178 | } |
1179 | } |
1180 | else if (GenTree::OperIsUnary(oper)) |
1181 | { |
1182 | // Nodes other than GT_NOT and GT_NEG are not yet supported. |
1183 | if (oper != GT_NOT && oper != GT_NEG) |
1184 | { |
1185 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); |
1186 | return false; |
1187 | } |
1188 | |
1189 | if (indirSrc->gtGetOp1()->OperGet() != GT_IND) |
1190 | { |
1191 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); |
1192 | return false; |
1193 | } |
1194 | |
1195 | GenTreeUnOp* unOp = indirSrc->AsUnOp(); |
1196 | if (IsRMWIndirCandidate(unOp->gtOp1, storeInd)) |
1197 | { |
1198 | // src and dest are the same in case of unary ops |
1199 | indirCandidate = unOp->gtOp1; |
1200 | indirOpSource = unOp->gtOp1; |
1201 | status = STOREIND_RMW_DST_IS_OP1; |
1202 | } |
1203 | else |
1204 | { |
1205 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); |
1206 | return false; |
1207 | } |
1208 | } |
1209 | else |
1210 | { |
1211 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); |
1212 | return false; |
1213 | } |
1214 | |
1215 | // By this point we've verified that we have a supported operand with a supported address. Now we need to ensure |
1216 | // that we're able to move the destination address for the source indirection forwards. |
1217 | if (!IsSafeToContainMem(storeInd, indirDst)) |
1218 | { |
1219 | storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); |
1220 | return false; |
1221 | } |
1222 | |
1223 | assert(indirCandidate != nullptr); |
1224 | assert(indirOpSource != nullptr); |
1225 | assert(status != STOREIND_RMW_STATUS_UNKNOWN); |
1226 | |
1227 | *outIndirCandidate = indirCandidate; |
1228 | *outIndirOpSource = indirOpSource; |
1229 | storeInd->SetRMWStatus(status); |
1230 | return true; |
1231 | } |
1232 | |
1233 | // anything is in range for AMD64 |
1234 | bool Lowering::IsCallTargetInRange(void* addr) |
1235 | { |
1236 | return true; |
1237 | } |
1238 | |
// Return true if the immediate can be folded into an instruction, e.g. small enough and non-relocatable.
1240 | bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode) |
1241 | { |
1242 | if (!childNode->IsIntCnsFitsInI32()) |
1243 | { |
1244 | return false; |
1245 | } |
1246 | |
    // At this point we know that it is an int const that fits within 4 bytes and hence can
    // safely be cast to IntConCommon.
    // Icons that need relocation should never be marked as contained immediates.
1249 | if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp)) |
1250 | { |
1251 | return false; |
1252 | } |
1253 | |
1254 | return true; |
1255 | } |
1256 | |
1257 | //----------------------------------------------------------------------- |
1258 | // PreferredRegOptionalOperand: returns one of the operands of given |
1259 | // binary oper that is to be preferred for marking as reg optional. |
1260 | // |
// Since only one of op1 or op2 can be a memory operand on xarch, only
// one of them has to be marked as reg optional. Since Lower doesn't
// know a priori which of op1 or op2 is less likely to get a register, it
// has to make a guess. This routine encapsulates heuristics that
// guess whether it is likely to be beneficial to mark op1 or op2 as
// reg optional.
1267 | // |
1268 | // |
1269 | // Arguments: |
1270 | // tree - a binary-op tree node that is either commutative |
1271 | // or a compare oper. |
1272 | // |
1273 | // Returns: |
1274 | // Returns op1 or op2 of tree node that is preferred for |
1275 | // marking as reg optional. |
1276 | // |
1277 | // Note: if the tree oper is neither commutative nor a compare oper |
1278 | // then only op2 can be reg optional on xarch and hence no need to |
1279 | // call this routine. |
1280 | GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree) |
1281 | { |
1282 | assert(GenTree::OperIsBinary(tree->OperGet())); |
1283 | assert(tree->OperIsCommutative() || tree->OperIsCompare() || tree->OperIs(GT_CMP)); |
1284 | |
1285 | GenTree* op1 = tree->gtGetOp1(); |
1286 | GenTree* op2 = tree->gtGetOp2(); |
1287 | assert(!op1->IsRegOptional() && !op2->IsRegOptional()); |
1288 | |
1289 | // We default to op1, as op2 is likely to have the shorter lifetime. |
1290 | GenTree* preferredOp = op1; |
1291 | |
1292 | // This routine uses the following heuristics: |
1293 | // |
1294 | // a) If both are register candidates, marking the one with lower weighted |
1295 | // ref count as reg-optional would likely be beneficial as it has |
1296 | // higher probability of not getting a register. Note that we use !lvDoNotEnregister |
1297 | // here because this is being done while we are adding lclVars for Lowering. |
1298 | // |
1299 | // b) op1 = tracked local and op2 = untracked local: LSRA creates two |
1300 | // ref positions for op2: a def and use position. op2's def position |
1301 | // requires a reg and it is allocated a reg by spilling another |
1302 | // interval (if required) and that could be even op1. For this reason |
1303 | // it is beneficial to mark op1 as reg optional. |
1304 | // |
1305 | // TODO: It is not always mandatory for a def position of an untracked |
1306 | // local to be allocated a register if it is on rhs of an assignment |
1307 | // and its use position is reg-optional and has not been assigned a |
1308 | // register. Reg optional def positions is currently not yet supported. |
1309 | // |
1310 | // c) op1 = untracked local and op2 = tracked local: marking op1 as |
1311 | // reg optional is beneficial, since its use position is less likely |
1312 | // to get a register. |
1313 | // |
1314 | // d) If both are untracked locals (i.e. treated like tree temps by |
1315 | // LSRA): though either of them could be marked as reg optional, |
1316 | // marking op1 as reg optional is likely to be beneficial because |
1317 | // while allocating op2's def position, there is a possibility of |
1318 | // spilling op1's def and in which case op1 is treated as contained |
1319 | // memory operand rather than requiring to reload. |
1320 | // |
1321 | // e) If only one of them is a local var, prefer to mark it as |
1322 | // reg-optional. This is heuristic is based on the results |
1323 | // obtained against CQ perf benchmarks. |
1324 | // |
1325 | // f) If neither of them are local vars (i.e. tree temps), prefer to |
1326 | // mark op1 as reg optional for the same reason as mentioned in (d) above. |
1327 | if (op1->OperGet() == GT_LCL_VAR && op2->OperGet() == GT_LCL_VAR) |
1328 | { |
1329 | LclVarDsc* v1 = comp->lvaTable + op1->AsLclVarCommon()->GetLclNum(); |
1330 | LclVarDsc* v2 = comp->lvaTable + op2->AsLclVarCommon()->GetLclNum(); |
1331 | |
1332 | bool v1IsRegCandidate = !v1->lvDoNotEnregister; |
1333 | bool v2IsRegCandidate = !v2->lvDoNotEnregister; |
1334 | if (v1IsRegCandidate && v2IsRegCandidate) |
1335 | { |
            // Both are enregisterable locals. The one with the lower weight is less
            // likely to get a register and hence it is beneficial to mark it as
            // reg optional.
1339 | // If either is not tracked, it may be that it was introduced after liveness |
1340 | // was run, in which case we will always prefer op1 (should we use raw refcnt??). |
1341 | if (v1->lvTracked && v2->lvTracked && (v1->lvRefCntWtd() >= v2->lvRefCntWtd())) |
1342 | { |
1343 | preferredOp = op2; |
1344 | } |
1345 | } |
1346 | } |
1347 | else if (!(op1->OperGet() == GT_LCL_VAR) && (op2->OperGet() == GT_LCL_VAR)) |
1348 | { |
1349 | preferredOp = op2; |
1350 | } |
1351 | |
1352 | return preferredOp; |
1353 | } |
1354 | |
1355 | //------------------------------------------------------------------------ |
1356 | // Containment analysis |
1357 | //------------------------------------------------------------------------ |
1358 | |
1359 | //------------------------------------------------------------------------ |
1360 | // ContainCheckCallOperands: Determine whether operands of a call should be contained. |
1361 | // |
1362 | // Arguments: |
1363 | // call - The call node of interest |
1364 | // |
1365 | // Return Value: |
1366 | // None. |
1367 | // |
1368 | void Lowering::ContainCheckCallOperands(GenTreeCall* call) |
1369 | { |
1370 | GenTree* ctrlExpr = call->gtControlExpr; |
1371 | if (call->gtCallType == CT_INDIRECT) |
1372 | { |
1373 | // either gtControlExpr != null or gtCallAddr != null. |
1374 | // Both cannot be non-null at the same time. |
1375 | assert(ctrlExpr == nullptr); |
1376 | assert(call->gtCallAddr != nullptr); |
1377 | ctrlExpr = call->gtCallAddr; |
1378 | |
1379 | #ifdef _TARGET_X86_ |
1380 | // Fast tail calls aren't currently supported on x86, but if they ever are, the code |
1381 | // below that handles indirect VSD calls will need to be fixed. |
1382 | assert(!call->IsFastTailCall() || !call->IsVirtualStub()); |
1383 | #endif // _TARGET_X86_ |
1384 | } |
1385 | |
    // Set reg requirements on the call target represented as a control sequence.
1387 | if (ctrlExpr != nullptr) |
1388 | { |
1389 | // we should never see a gtControlExpr whose type is void. |
1390 | assert(ctrlExpr->TypeGet() != TYP_VOID); |
1391 | |
        // In the case of a fast tail call implemented as a jmp, make sure that gtControlExpr is
        // computed into a register.
1394 | if (!call->IsFastTailCall()) |
1395 | { |
1396 | #ifdef _TARGET_X86_ |
1397 | // On x86, we need to generate a very specific pattern for indirect VSD calls: |
1398 | // |
1399 | // 3-byte nop |
1400 | // call dword ptr [eax] |
1401 | // |
1402 | // Where EAX is also used as an argument to the stub dispatch helper. Make |
1403 | // sure that the call target address is computed into EAX in this case. |
1404 | if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) |
1405 | { |
1406 | assert(ctrlExpr->isIndir()); |
1407 | MakeSrcContained(call, ctrlExpr); |
1408 | } |
1409 | else |
1410 | #endif // _TARGET_X86_ |
1411 | if (ctrlExpr->isIndir()) |
1412 | { |
                // We may have cases where we have set a register target on the ctrlExpr, but if it
                // is contained we must clear it.
1415 | ctrlExpr->gtRegNum = REG_NA; |
1416 | MakeSrcContained(call, ctrlExpr); |
1417 | } |
1418 | } |
1419 | } |
1420 | |
1421 | GenTree* args = call->gtCallArgs; |
1422 | while (args) |
1423 | { |
1424 | GenTree* arg = args->gtOp.gtOp1; |
1425 | if (arg->gtOper == GT_PUTARG_STK) |
1426 | { |
1427 | LowerPutArgStk(arg->AsPutArgStk()); |
1428 | } |
1429 | args = args->gtOp.gtOp2; |
1430 | } |
1431 | args = call->gtCallLateArgs; |
1432 | while (args) |
1433 | { |
1434 | GenTree* arg = args->gtOp.gtOp1; |
1435 | if (arg->gtOper == GT_PUTARG_STK) |
1436 | { |
1437 | LowerPutArgStk(arg->AsPutArgStk()); |
1438 | } |
1439 | args = args->gtOp.gtOp2; |
1440 | } |
1441 | } |
1442 | |
1443 | //------------------------------------------------------------------------ |
1444 | // ContainCheckIndir: Determine whether operands of an indir should be contained. |
1445 | // |
1446 | // Arguments: |
1447 | // node - The indirection node of interest |
1448 | // |
1449 | // Notes: |
1450 | // This is called for both store and load indirections. In the former case, it is assumed that |
1451 | // LowerStoreIndir() has already been called to check for RMW opportunities. |
1452 | // |
1453 | // Return Value: |
1454 | // None. |
1455 | // |
1456 | void Lowering::ContainCheckIndir(GenTreeIndir* node) |
1457 | { |
1458 | GenTree* addr = node->Addr(); |
1459 | |
1460 | // If this is the rhs of a block copy it will be handled when we handle the store. |
1461 | if (node->TypeGet() == TYP_STRUCT) |
1462 | { |
1463 | return; |
1464 | } |
1465 | |
1466 | #ifdef FEATURE_SIMD |
    // If the indirection is of TYP_SIMD12, don't mark addr as contained
    // so that it always gets computed into a register. This means the
    // codegen-side logic doesn't need to handle all the possible addr
    // expressions that could be contained.
1471 | // |
1472 | // TODO-XArch-CQ: handle other addr mode expressions that could be marked |
1473 | // as contained. |
1474 | if (node->TypeGet() == TYP_SIMD12) |
1475 | { |
1476 | return; |
1477 | } |
1478 | #endif // FEATURE_SIMD |
1479 | |
1480 | if ((node->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0) |
1481 | { |
        // This indirection requires its address in a register, so skip any
        // further processing that might otherwise make the address contained.
1484 | } |
1485 | else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) |
1486 | { |
        // These nodes go into an addr mode:
        // - GT_CLS_VAR_ADDR turns into a constant.
        // - GT_LCL_VAR_ADDR is a stack addr mode.
        // Make the address contained so that it folds into the addr mode.
1492 | MakeSrcContained(node, addr); |
1493 | } |
1494 | else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) |
1495 | { |
1496 | // Amd64: |
        // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address
        // (i.e. those VSD calls for which the stub addr is known at JIT compile time). In that case,
        // the VM requires us to pass the stub addr in VirtualStubParam.reg - see LowerVirtualStubCall().
        // For that reason we cannot mark such an addr as contained. Note that this is not an issue for
        // indirect VSD calls since morphArgs() explicitly materializes the hidden param as a non-standard
        // argument.
1503 | // |
1504 | // Workaround: |
1505 | // Note that LowerVirtualStubCall() sets addr->gtRegNum to VirtualStubParam.reg and Lowering::doPhase() |
1506 | // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA. |
1507 | // Ideally we should set a flag on addr nodes that shouldn't be marked as contained |
1508 | // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround |
1509 | // an explicit check is made here. |
1510 | // |
1511 | // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained. |
1512 | MakeSrcContained(node, addr); |
1513 | } |
1514 | else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(node, addr)) |
1515 | { |
1516 | MakeSrcContained(node, addr); |
1517 | } |
1518 | } |
1519 | |
1520 | //------------------------------------------------------------------------ |
1521 | // ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained. |
1522 | // |
1523 | // Arguments: |
1524 | // node - pointer to the node |
1525 | // |
1526 | void Lowering::ContainCheckStoreIndir(GenTreeIndir* node) |
1527 | { |
1528 | // If the source is a containable immediate, make it contained, unless it is |
1529 | // an int-size or larger store of zero to memory, because we can generate smaller code |
1530 | // by zeroing a register and then storing it. |
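    // (I.e. "xor reg, reg" followed by "mov [mem], reg" is typically smaller than
    // "mov dword ptr [mem], 0" with its 4-byte immediate.)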
1531 | GenTree* src = node->gtOp.gtOp2; |
1532 | if (IsContainableImmed(node, src) && |
1533 | (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) |
1534 | { |
1535 | MakeSrcContained(node, src); |
1536 | } |
1537 | ContainCheckIndir(node); |
1538 | } |
1539 | |
1540 | //------------------------------------------------------------------------ |
1541 | // ContainCheckMul: determine whether the sources of a MUL node should be contained. |
1542 | // |
1543 | // Arguments: |
1544 | // node - pointer to the node |
1545 | // |
1546 | void Lowering::ContainCheckMul(GenTreeOp* node) |
1547 | { |
1548 | #if defined(_TARGET_X86_) |
1549 | assert(node->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG)); |
1550 | #else |
1551 | assert(node->OperIs(GT_MUL, GT_MULHI)); |
1552 | #endif |
1553 | |
1554 | // Case of float/double mul. |
1555 | if (varTypeIsFloating(node->TypeGet())) |
1556 | { |
1557 | ContainCheckFloatBinary(node); |
1558 | return; |
1559 | } |
1560 | |
1561 | GenTree* op1 = node->gtOp.gtOp1; |
1562 | GenTree* op2 = node->gtOp.gtOp2; |
1563 | |
1564 | bool isSafeToContainOp1 = true; |
1565 | bool isSafeToContainOp2 = true; |
1566 | |
1567 | bool isUnsignedMultiply = ((node->gtFlags & GTF_UNSIGNED) != 0); |
1568 | bool requiresOverflowCheck = node->gtOverflowEx(); |
1569 | bool useLeaEncoding = false; |
1570 | GenTree* memOp = nullptr; |
1571 | |
1572 | bool hasImpliedFirstOperand = false; |
1573 | GenTreeIntConCommon* imm = nullptr; |
1574 | GenTree* other = nullptr; |
1575 | |
    // Multiply should never use small types
1577 | assert(!varTypeIsSmall(node->TypeGet())); |
1578 | |
    // We use the widening multiply to implement
    // the overflow checking for unsigned multiply
    //
1582 | if (isUnsignedMultiply && requiresOverflowCheck) |
1583 | { |
1584 | hasImpliedFirstOperand = true; |
1585 | } |
1586 | else if (node->OperGet() == GT_MULHI) |
1587 | { |
1588 | hasImpliedFirstOperand = true; |
1589 | } |
1590 | #if defined(_TARGET_X86_) |
1591 | else if (node->OperGet() == GT_MUL_LONG) |
1592 | { |
1593 | hasImpliedFirstOperand = true; |
1594 | } |
1595 | #endif |
1596 | else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1)) |
1597 | { |
1598 | if (IsContainableImmed(node, op2)) |
1599 | { |
1600 | imm = op2->AsIntConCommon(); |
1601 | other = op1; |
1602 | } |
1603 | else |
1604 | { |
1605 | imm = op1->AsIntConCommon(); |
1606 | other = op2; |
1607 | } |
1608 | |
1609 | // CQ: We want to rewrite this into a LEA |
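        // (e.g. "x * 5" can be encoded as "lea reg, [x + x*4]", avoiding the mul.)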
        ssize_t immVal = imm->IconValue();
1611 | if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9)) |
1612 | { |
1613 | useLeaEncoding = true; |
1614 | } |
1615 | |
1616 | MakeSrcContained(node, imm); // The imm is always contained |
1617 | if (IsContainableMemoryOp(other)) |
1618 | { |
1619 | memOp = other; // memOp may be contained below |
1620 | } |
1621 | } |
1622 | |
    // We allow one operand to be a contained memory operand.
    // The memory op type must match the 'node' type.
    // This is because during codegen we use the 'node' type to derive EmitTypeSize.
    // E.g. op1 type = byte, op2 type = byte, but the GT_MUL node type is int.
1627 | // |
1628 | if (memOp == nullptr) |
1629 | { |
1630 | if ((op2->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op2)) |
1631 | { |
1632 | isSafeToContainOp2 = IsSafeToContainMem(node, op2); |
1633 | if (isSafeToContainOp2) |
1634 | { |
1635 | memOp = op2; |
1636 | } |
1637 | } |
1638 | |
1639 | if ((memOp == nullptr) && (op1->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op1)) |
1640 | { |
1641 | isSafeToContainOp1 = IsSafeToContainMem(node, op1); |
1642 | if (isSafeToContainOp1) |
1643 | { |
1644 | memOp = op1; |
1645 | } |
1646 | } |
1647 | } |
1648 | else |
1649 | { |
1650 | if ((memOp->TypeGet() != node->TypeGet())) |
1651 | { |
1652 | memOp = nullptr; |
1653 | } |
1654 | else if (!IsSafeToContainMem(node, memOp)) |
1655 | { |
1656 | if (memOp == op1) |
1657 | { |
1658 | isSafeToContainOp1 = false; |
1659 | } |
1660 | else |
1661 | { |
1662 | isSafeToContainOp2 = false; |
1663 | } |
1664 | memOp = nullptr; |
1665 | } |
1666 | } |
    // To generate an LEA we need to force memOp into a register,
    // so don't allow memOp to be 'contained'.
1669 | // |
1670 | if (!useLeaEncoding) |
1671 | { |
1672 | if (memOp != nullptr) |
1673 | { |
1674 | MakeSrcContained(node, memOp); |
1675 | } |
1676 | else |
1677 | { |
1678 | // IsSafeToContainMem is expensive so we call it at most once for each operand |
1679 | // in this method. If we already called IsSafeToContainMem, it must have returned false; |
1680 | // otherwise, memOp would be set to the corresponding operand (op1 or op2). |
1681 | if (imm != nullptr) |
1682 | { |
1683 | // Has a contained immediate operand. |
1684 | // Only 'other' operand can be marked as reg optional. |
1685 | assert(other != nullptr); |
1686 | |
1687 | isSafeToContainOp1 = ((other == op1) && isSafeToContainOp1 && IsSafeToContainMem(node, op1)); |
1688 | isSafeToContainOp2 = ((other == op2) && isSafeToContainOp2 && IsSafeToContainMem(node, op2)); |
1689 | } |
1690 | else if (hasImpliedFirstOperand) |
1691 | { |
1692 | // Only op2 can be marked as reg optional. |
1693 | isSafeToContainOp1 = false; |
1694 | isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2); |
1695 | } |
1696 | else |
1697 | { |
1698 | // If there are no containable operands, we can make either of op1 or op2 |
1699 | // as reg optional. |
1700 | isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1); |
1701 | isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2); |
1702 | } |
1703 | SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2); |
1704 | } |
1705 | } |
1706 | } |
1707 | |
1708 | //------------------------------------------------------------------------ |
1709 | // ContainCheckDivOrMod: determine which operands of a div/mod should be contained. |
1710 | // |
1711 | // Arguments: |
1712 | // node - pointer to the node |
1713 | // |
1714 | void Lowering::ContainCheckDivOrMod(GenTreeOp* node) |
1715 | { |
1716 | assert(node->OperIs(GT_DIV, GT_MOD, GT_UDIV, GT_UMOD)); |
1717 | |
1718 | if (varTypeIsFloating(node->TypeGet())) |
1719 | { |
1720 | ContainCheckFloatBinary(node); |
1721 | return; |
1722 | } |
1723 | |
1724 | GenTree* dividend = node->gtGetOp1(); |
1725 | GenTree* divisor = node->gtGetOp2(); |
1726 | |
1727 | bool divisorCanBeRegOptional = true; |
1728 | #ifdef _TARGET_X86_ |
1729 | if (dividend->OperGet() == GT_LONG) |
1730 | { |
1731 | divisorCanBeRegOptional = false; |
1732 | MakeSrcContained(node, dividend); |
1733 | } |
1734 | #endif |
1735 | |
1736 | // divisor can be an r/m, but the memory indirection must be of the same size as the divide |
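    // (e.g. a 32-bit signed divide can consume "idiv dword ptr [mem]", but not a
    // differently-sized memory operand).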
1737 | if (IsContainableMemoryOp(divisor) && (divisor->TypeGet() == node->TypeGet())) |
1738 | { |
1739 | MakeSrcContained(node, divisor); |
1740 | } |
1741 | else if (divisorCanBeRegOptional) |
1742 | { |
1743 | // If there are no containable operands, we can make an operand reg optional. |
1744 | // Div instruction allows only divisor to be a memory op. |
1745 | divisor->SetRegOptional(); |
1746 | } |
1747 | } |
1748 | |
1749 | //------------------------------------------------------------------------ |
1750 | // ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained. |
1751 | // |
1752 | // Arguments: |
1753 | // node - pointer to the node |
1754 | // |
1755 | void Lowering::ContainCheckShiftRotate(GenTreeOp* node) |
1756 | { |
1757 | assert(node->OperIsShiftOrRotate()); |
1758 | #ifdef _TARGET_X86_ |
1759 | GenTree* source = node->gtOp1; |
1760 | if (node->OperIsShiftLong()) |
1761 | { |
1762 | assert(source->OperGet() == GT_LONG); |
1763 | MakeSrcContained(node, source); |
1764 | } |
#endif // _TARGET_X86_
1766 | |
1767 | GenTree* shiftBy = node->gtOp2; |
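    // The shift count is encoded as an 8-bit immediate, so we only contain
    // constants in the range [0..255].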
1768 | if (IsContainableImmed(node, shiftBy) && (shiftBy->gtIntConCommon.IconValue() <= 255) && |
1769 | (shiftBy->gtIntConCommon.IconValue() >= 0)) |
1770 | { |
1771 | MakeSrcContained(node, shiftBy); |
1772 | } |
1773 | } |
1774 | |
1775 | //------------------------------------------------------------------------ |
1776 | // ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained. |
1777 | // |
1778 | // Arguments: |
//    storeLoc - pointer to the local store node
1780 | // |
1781 | void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc) |
1782 | { |
1783 | assert(storeLoc->OperIsLocalStore()); |
1784 | GenTree* op1 = storeLoc->gtGetOp1(); |
1785 | |
1786 | #ifdef FEATURE_SIMD |
1787 | if (varTypeIsSIMD(storeLoc)) |
1788 | { |
1789 | if (op1->IsCnsIntOrI()) |
1790 | { |
1791 | // For an InitBlk we want op1 to be contained; otherwise we want it to |
1792 | // be evaluated into an xmm register. |
1793 | MakeSrcContained(storeLoc, op1); |
1794 | } |
1795 | return; |
1796 | } |
1797 | #endif // FEATURE_SIMD |
1798 | |
1799 | // If the source is a containable immediate, make it contained, unless it is |
1800 | // an int-size or larger store of zero to memory, because we can generate smaller code |
1801 | // by zeroing a register and then storing it. |
1802 | if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc))) |
1803 | { |
1804 | MakeSrcContained(storeLoc, op1); |
1805 | } |
1806 | #ifdef _TARGET_X86_ |
1807 | else if (op1->OperGet() == GT_LONG) |
1808 | { |
1809 | MakeSrcContained(storeLoc, op1); |
1810 | } |
1811 | #endif // _TARGET_X86_ |
1812 | } |
1813 | |
1814 | //------------------------------------------------------------------------ |
1815 | // ContainCheckCast: determine whether the source of a CAST node should be contained. |
1816 | // |
1817 | // Arguments: |
1818 | // node - pointer to the node |
1819 | // |
1820 | void Lowering::ContainCheckCast(GenTreeCast* node) |
1821 | { |
1822 | GenTree* castOp = node->CastOp(); |
1823 | var_types castToType = node->CastToType(); |
1824 | var_types srcType = castOp->TypeGet(); |
1825 | |
    // Force the srcType to unsigned if the GTF_UNSIGNED flag is set.
1827 | if (node->gtFlags & GTF_UNSIGNED) |
1828 | { |
1829 | srcType = genUnsignedType(srcType); |
1830 | } |
1831 | |
1832 | if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType))) |
1833 | { |
1834 | #ifdef DEBUG |
    // If converting to float/double, the operand must be 4 or 8 bytes in size.
1836 | if (varTypeIsFloating(castToType)) |
1837 | { |
1838 | unsigned opSize = genTypeSize(srcType); |
1839 | assert(opSize == 4 || opSize == 8); |
1840 | } |
1841 | #endif // DEBUG |
1842 | |
1843 | // U8 -> R8 conversion requires that the operand be in a register. |
1844 | if (srcType != TYP_ULONG) |
1845 | { |
1846 | if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl()) |
1847 | { |
1848 | MakeSrcContained(node, castOp); |
1849 | } |
1850 | else |
1851 | { |
                // Mark castOp as reg optional to indicate codegen
                // can still generate code if it is on the stack.
1854 | castOp->SetRegOptional(); |
1855 | } |
1856 | } |
1857 | } |
1858 | #if !defined(_TARGET_64BIT_) |
1859 | if (varTypeIsLong(srcType)) |
1860 | { |
1861 | noway_assert(castOp->OperGet() == GT_LONG); |
1862 | castOp->SetContained(); |
1863 | } |
1864 | #endif // !defined(_TARGET_64BIT_) |
1865 | } |
1866 | |
1867 | //------------------------------------------------------------------------ |
1868 | // ContainCheckCompare: determine whether the sources of a compare node should be contained. |
1869 | // |
1870 | // Arguments: |
//    cmp - pointer to the compare node
1872 | // |
1873 | void Lowering::ContainCheckCompare(GenTreeOp* cmp) |
1874 | { |
1875 | assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP)); |
1876 | |
1877 | GenTree* op1 = cmp->gtOp.gtOp1; |
1878 | GenTree* op2 = cmp->gtOp.gtOp2; |
1879 | var_types op1Type = op1->TypeGet(); |
1880 | var_types op2Type = op2->TypeGet(); |
1881 | |
    // If either op1 or op2 is a floating point value, then we need to use
    // ucomiss or ucomisd to compare, both of which support the following form:
    //     ucomis[s|d] xmm, xmm/mem
    // That is, only the second operand can be a memory op.
    //
    // Note that depending on the comparison operator, the operands of
    // ucomis[s|d] may need to be reversed. Therefore, either op1 or op2
    // can end up being the memory op.
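    // For example, for an ordered "a < b" the operands are swapped so that
    // 'a' (op1) becomes the candidate xmm/mem operand.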
1890 | if (varTypeIsFloating(op1Type)) |
1891 | { |
        // The operand types must be the same; there are no implicit conversions at this stage.
1893 | assert(op1Type == op2Type); |
1894 | |
1895 | bool reverseOps; |
1896 | if ((cmp->gtFlags & GTF_RELOP_NAN_UN) != 0) |
1897 | { |
1898 | // Unordered comparison case |
1899 | reverseOps = cmp->OperIs(GT_GT, GT_GE); |
1900 | } |
1901 | else |
1902 | { |
1903 | reverseOps = cmp->OperIs(GT_LT, GT_LE); |
1904 | } |
1905 | |
1906 | GenTree* otherOp; |
1907 | if (reverseOps) |
1908 | { |
1909 | otherOp = op1; |
1910 | } |
1911 | else |
1912 | { |
1913 | otherOp = op2; |
1914 | } |
1915 | |
1916 | assert(otherOp != nullptr); |
1917 | bool isSafeToContainOtherOp = true; |
1918 | if (otherOp->IsCnsNonZeroFltOrDbl()) |
1919 | { |
1920 | MakeSrcContained(cmp, otherOp); |
1921 | } |
1922 | else if (IsContainableMemoryOp(otherOp)) |
1923 | { |
1924 | isSafeToContainOtherOp = IsSafeToContainMem(cmp, otherOp); |
1925 | if (isSafeToContainOtherOp) |
1926 | { |
1927 | MakeSrcContained(cmp, otherOp); |
1928 | } |
1929 | } |
1930 | |
1931 | if (!otherOp->isContained() && isSafeToContainOtherOp && IsSafeToContainMem(cmp, otherOp)) |
1932 | { |
1933 | // SSE2 allows only otherOp to be a memory-op. Since otherOp is not |
1934 | // contained, we can mark it reg-optional. |
1935 | // IsSafeToContainMem is expensive so we call it at most once for otherOp. |
1936 | // If we already called IsSafeToContainMem, it must have returned false; |
1937 | // otherwise, otherOp would be contained. |
1938 | otherOp->SetRegOptional(); |
1939 | } |
1940 | |
1941 | return; |
1942 | } |
1943 | |
    // TODO-XArch-CQ: factor out the cmp optimization in 'genCondSetFlags' to be used here
    // or in other backends.
1946 | |
1947 | if (CheckImmedAndMakeContained(cmp, op2)) |
1948 | { |
1949 | // If the types are the same, or if the constant is of the correct size, |
1950 | // we can treat the MemoryOp as contained. |
1951 | if (op1Type == op2Type) |
1952 | { |
1953 | if (IsContainableMemoryOp(op1)) |
1954 | { |
1955 | MakeSrcContained(cmp, op1); |
1956 | } |
1957 | else |
1958 | { |
1959 | op1->SetRegOptional(); |
1960 | } |
1961 | } |
1962 | } |
1963 | else if (op1Type == op2Type) |
1964 | { |
1965 | // Note that TEST does not have a r,rm encoding like CMP has but we can still |
1966 | // contain the second operand because the emitter maps both r,rm and rm,r to |
1967 | // the same instruction code. This avoids the need to special case TEST here. |
1968 | |
1969 | bool isSafeToContainOp1 = true; |
1970 | bool isSafeToContainOp2 = true; |
1971 | |
1972 | if (IsContainableMemoryOp(op2)) |
1973 | { |
1974 | isSafeToContainOp2 = IsSafeToContainMem(cmp, op2); |
1975 | if (isSafeToContainOp2) |
1976 | { |
1977 | MakeSrcContained(cmp, op2); |
1978 | } |
1979 | } |
1980 | |
1981 | if (!op2->isContained() && IsContainableMemoryOp(op1)) |
1982 | { |
1983 | isSafeToContainOp1 = IsSafeToContainMem(cmp, op1); |
1984 | if (isSafeToContainOp1) |
1985 | { |
1986 | MakeSrcContained(cmp, op1); |
1987 | } |
1988 | } |
1989 | |
1990 | if (!op1->isContained() && !op2->isContained()) |
1991 | { |
1992 | // One of op1 or op2 could be marked as reg optional |
1993 | // to indicate that codegen can still generate code |
1994 | // if one of them is on stack. |
1995 | GenTree* regOptionalCandidate = op1->IsCnsIntOrI() ? op2 : PreferredRegOptionalOperand(cmp); |
1996 | |
1997 | // IsSafeToContainMem is expensive so we call it at most once for each operand |
1998 | // in this method. If we already called IsSafeToContainMem, it must have returned false; |
1999 | // otherwise, the corresponding operand (op1 or op2) would be contained. |
2000 | bool setRegOptional = (regOptionalCandidate == op1) ? isSafeToContainOp1 && IsSafeToContainMem(cmp, op1) |
2001 | : isSafeToContainOp2 && IsSafeToContainMem(cmp, op2); |
2002 | if (setRegOptional) |
2003 | { |
2004 | regOptionalCandidate->SetRegOptional(); |
2005 | } |
2006 | } |
2007 | } |
2008 | } |
2009 | |
2010 | //------------------------------------------------------------------------ |
2011 | // LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly |
2012 | // |
2013 | // Arguments: |
//    storeInd - The indirect store node (GT_STOREIND) of interest
2015 | // |
2016 | // Return Value: |
2017 | // Returns true if 'node' is a valid RMW mem op; false otherwise. |
2018 | // |
2019 | bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd) |
2020 | { |
2021 | assert(storeInd->OperGet() == GT_STOREIND); |
2022 | |
2023 | // SSE2 doesn't support RMW on float values |
2024 | assert(!varTypeIsFloating(storeInd)); |
2025 | |
    // Terminology:
    // indirDst = memory write of an addr mode (i.e. storeind destination)
    // indirSrc = value being written to memory (i.e. storeind source, which could be a binary/unary op)
    // indirCandidate = memory read, i.e. a GT_IND of an addr mode
    // indirOpSource = source operand used in the binary/unary op (i.e. source operand of the indirSrc node)
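    //
    // For example, an assignment like "*p op= x" arrives here as
    //     storeInd(p, binOp(ind(p), x))
    // where indirDst = p, indirSrc = the binOp, indirCandidate = ind(p), and
    // indirOpSource = x; the whole tree can be emitted as a single "op [p], x".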
2031 | |
2032 | GenTree* indirCandidate = nullptr; |
2033 | GenTree* indirOpSource = nullptr; |
2034 | |
2035 | if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource)) |
2036 | { |
2037 | JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n" , |
2038 | storeInd->AsStoreInd()->GetRMWStatus()); |
2039 | DISPTREERANGE(BlockRange(), storeInd); |
2040 | return false; |
2041 | } |
2042 | |
2043 | GenTree* indirDst = storeInd->gtGetOp1(); |
2044 | GenTree* indirSrc = storeInd->gtGetOp2(); |
2045 | genTreeOps oper = indirSrc->OperGet(); |
2046 | |
2047 | // At this point we have successfully detected a RMW memory op of one of the following forms |
2048 | // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR |
2049 | // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR |
2050 | // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations |
2051 | // |
2052 | // Here indirSrc = one of the supported binary or unary operation for RMW of memory |
2053 | // indirCandidate = a GT_IND node |
2054 | // indirCandidateChild = operand of GT_IND indirCandidate |
2055 | // |
2056 | // The logic below does the following |
2057 | // Make indirOpSource contained. |
2058 | // Make indirSrc contained. |
2059 | // Make indirCandidate contained. |
2060 | // Make indirCandidateChild contained. |
2061 | // Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr |
2062 | // base. |
2063 | // |
2064 | |
2065 | // We have already done containment analysis on the indirSrc op. |
2066 | // If any of its operands are marked regOptional, reset that now. |
2067 | indirSrc->AsOp()->gtOp1->ClearRegOptional(); |
2068 | if (GenTree::OperIsBinary(oper)) |
2069 | { |
2070 | // On Xarch RMW operations require the source to be an immediate or in a register. |
2071 | // Therefore, if we have previously marked the indirOpSource as contained while lowering |
2072 | // the binary node, we need to reset that now. |
2073 | if (IsContainableMemoryOp(indirOpSource)) |
2074 | { |
2075 | indirOpSource->ClearContained(); |
2076 | } |
2077 | indirSrc->AsOp()->gtOp2->ClearRegOptional(); |
2078 | JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n" ); |
2079 | } |
2080 | else |
2081 | { |
2082 | assert(GenTree::OperIsUnary(oper)); |
2083 | JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n" ); |
2084 | } |
2085 | DISPTREERANGE(BlockRange(), storeInd); |
2086 | |
2087 | indirSrc->SetContained(); |
2088 | indirCandidate->SetContained(); |
2089 | |
2090 | GenTree* indirCandidateChild = indirCandidate->gtGetOp1(); |
2091 | indirCandidateChild->SetContained(); |
2092 | |
2093 | if (indirCandidateChild->OperGet() == GT_LEA) |
2094 | { |
2095 | GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode(); |
2096 | |
2097 | if (addrMode->HasBase()) |
2098 | { |
2099 | assert(addrMode->Base()->OperIsLeaf()); |
2100 | addrMode->Base()->SetContained(); |
2101 | } |
2102 | |
2103 | if (addrMode->HasIndex()) |
2104 | { |
2105 | assert(addrMode->Index()->OperIsLeaf()); |
2106 | addrMode->Index()->SetContained(); |
2107 | } |
2108 | |
2109 | indirDst->SetContained(); |
2110 | } |
2111 | else |
2112 | { |
2113 | assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || |
2114 | indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT); |
2115 | |
        // If it is a GT_LCL_VAR, it still needs a reg to hold the address.
        // We would also need a reg for a GT_CNS_INT if it doesn't fit within the addressing mode base.
        // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address
        // is known at jit time.
2120 | if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR) |
2121 | { |
2122 | indirDst->SetContained(); |
2123 | } |
2124 | else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp)) |
2125 | { |
2126 | indirDst->SetContained(); |
2127 | } |
2128 | } |
2129 | return true; |
2130 | } |
2131 | |
2132 | //------------------------------------------------------------------------ |
2133 | // ContainCheckBinary: Determine whether a binary op's operands should be contained. |
2134 | // |
2135 | // Arguments: |
2136 | // node - the node we care about |
2137 | // |
2138 | void Lowering::ContainCheckBinary(GenTreeOp* node) |
2139 | { |
2140 | assert(node->OperIsBinary()); |
2141 | |
2142 | if (varTypeIsFloating(node)) |
2143 | { |
2144 | assert(node->OperIs(GT_ADD, GT_SUB)); |
2145 | ContainCheckFloatBinary(node); |
2146 | return; |
2147 | } |
2148 | |
2149 | GenTree* op1 = node->gtOp1; |
2150 | GenTree* op2 = node->gtOp2; |
2151 | |
    // We can directly encode the second operand if it is either a containable constant or a memory-op.
    // In the case of a memory-op, we can encode it directly provided its type matches the 'tree' type.
    // This is because during codegen, the type of 'tree' is used to determine the emit type size. If the
    // types do not match, they get normalized (i.e. sign/zero extended) on load into a register.
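    // For example, an int-typed ADD cannot directly consume a byte-sized memory
    // operand; the byte value must first be loaded (and normalized) into a register.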
2156 | bool directlyEncodable = false; |
2157 | bool binOpInRMW = false; |
2158 | GenTree* operand = nullptr; |
2159 | bool isSafeToContainOp1 = true; |
2160 | bool isSafeToContainOp2 = true; |
2161 | |
2162 | if (IsContainableImmed(node, op2)) |
2163 | { |
2164 | directlyEncodable = true; |
2165 | operand = op2; |
2166 | } |
2167 | else |
2168 | { |
2169 | binOpInRMW = IsBinOpInRMWStoreInd(node); |
2170 | if (!binOpInRMW) |
2171 | { |
2172 | const unsigned operatorSize = genTypeSize(node->TypeGet()); |
2173 | if ((genTypeSize(op2->TypeGet()) == operatorSize) && IsContainableMemoryOp(op2)) |
2174 | { |
2175 | isSafeToContainOp2 = IsSafeToContainMem(node, op2); |
2176 | if (isSafeToContainOp2) |
2177 | { |
2178 | directlyEncodable = true; |
2179 | operand = op2; |
2180 | } |
2181 | } |
2182 | |
2183 | if ((operand == nullptr) && node->OperIsCommutative()) |
2184 | { |
2185 | // If it is safe, we can reverse the order of operands of commutative operations for efficient |
2186 | // codegen |
2187 | if (IsContainableImmed(node, op1)) |
2188 | { |
2189 | directlyEncodable = true; |
2190 | operand = op1; |
2191 | } |
2192 | else if ((genTypeSize(op1->TypeGet()) == operatorSize) && IsContainableMemoryOp(op1)) |
2193 | { |
2194 | isSafeToContainOp1 = IsSafeToContainMem(node, op1); |
2195 | if (isSafeToContainOp1) |
2196 | { |
2197 | directlyEncodable = true; |
2198 | operand = op1; |
2199 | } |
2200 | } |
2201 | } |
2202 | } |
2203 | } |
2204 | |
2205 | if (directlyEncodable) |
2206 | { |
2207 | assert(operand != nullptr); |
2208 | MakeSrcContained(node, operand); |
2209 | } |
2210 | else if (!binOpInRMW) |
2211 | { |
2212 | // If this binary op neither has contained operands, nor is a |
2213 | // Read-Modify-Write (RMW) operation, we can mark its operands |
2214 | // as reg optional. |
2215 | |
2216 | // IsSafeToContainMem is expensive so we call it at most once for each operand |
2217 | // in this method. If we already called IsSafeToContainMem, it must have returned false; |
2218 | // otherwise, directlyEncodable would be true. |
2219 | isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1); |
2220 | isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2); |
2221 | |
2222 | SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2); |
2223 | } |
2224 | } |
2225 | |
2226 | //------------------------------------------------------------------------ |
2227 | // ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained. |
2228 | // |
2229 | // Arguments: |
2230 | // node - pointer to the node |
2231 | // |
2232 | void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node) |
2233 | { |
2234 | assert(node->OperIsBoundsCheck()); |
2235 | GenTree* other; |
2236 | if (CheckImmedAndMakeContained(node, node->gtIndex)) |
2237 | { |
2238 | other = node->gtArrLen; |
2239 | } |
2240 | else if (CheckImmedAndMakeContained(node, node->gtArrLen)) |
2241 | { |
2242 | other = node->gtIndex; |
2243 | } |
2244 | else if (IsContainableMemoryOp(node->gtIndex)) |
2245 | { |
2246 | other = node->gtIndex; |
2247 | } |
2248 | else |
2249 | { |
2250 | other = node->gtArrLen; |
2251 | } |
2252 | |
2253 | if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet()) |
2254 | { |
2255 | if (IsContainableMemoryOp(other)) |
2256 | { |
2257 | MakeSrcContained(node, other); |
2258 | } |
2259 | else |
2260 | { |
2261 | // We can mark 'other' as reg optional, since it is not contained. |
2262 | other->SetRegOptional(); |
2263 | } |
2264 | } |
2265 | } |
2266 | |
2267 | //------------------------------------------------------------------------ |
2268 | // ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained. |
2269 | // |
2270 | // Arguments: |
2271 | // node - pointer to the node |
2272 | // |
2273 | void Lowering::ContainCheckIntrinsic(GenTreeOp* node) |
2274 | { |
2275 | assert(node->OperIs(GT_INTRINSIC)); |
2276 | |
2277 | CorInfoIntrinsics intrinsicId = node->gtIntrinsic.gtIntrinsicId; |
2278 | |
2279 | if (intrinsicId == CORINFO_INTRINSIC_Sqrt || intrinsicId == CORINFO_INTRINSIC_Round || |
2280 | intrinsicId == CORINFO_INTRINSIC_Ceiling || intrinsicId == CORINFO_INTRINSIC_Floor) |
2281 | { |
2282 | GenTree* op1 = node->gtGetOp1(); |
2283 | if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl()) |
2284 | { |
2285 | MakeSrcContained(node, op1); |
2286 | } |
2287 | else |
2288 | { |
2289 | // Mark the operand as reg optional since codegen can still |
2290 | // generate code if op1 is on stack. |
2291 | op1->SetRegOptional(); |
2292 | } |
2293 | } |
2294 | } |
2295 | |
2296 | #ifdef FEATURE_SIMD |
2297 | //---------------------------------------------------------------------------------------------- |
2298 | // ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node. |
2299 | // |
2300 | // Arguments: |
2301 | // simdNode - The SIMD intrinsic node. |
2302 | // |
2303 | void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) |
2304 | { |
2305 | switch (simdNode->gtSIMDIntrinsicID) |
2306 | { |
2307 | GenTree* op1; |
2308 | GenTree* op2; |
2309 | |
2310 | case SIMDIntrinsicInit: |
2311 | { |
2312 | op1 = simdNode->gtOp.gtOp1; |
2313 | #ifndef _TARGET_64BIT_ |
2314 | if (op1->OperGet() == GT_LONG) |
2315 | { |
2316 | MakeSrcContained(simdNode, op1); |
2317 | GenTree* op1lo = op1->gtGetOp1(); |
2318 | GenTree* op1hi = op1->gtGetOp2(); |
2319 | |
2320 | if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || |
2321 | (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))) |
2322 | { |
2323 | MakeSrcContained(op1, op1lo); |
2324 | MakeSrcContained(op1, op1hi); |
2325 | } |
2326 | } |
2327 | else |
2328 | #endif // !_TARGET_64BIT_ |
2329 | if (op1->IsFPZero() || op1->IsIntegralConst(0) || |
2330 | (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1))) |
2331 | { |
2332 | MakeSrcContained(simdNode, op1); |
2333 | } |
2334 | else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) && |
2335 | ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32))) |
2336 | { |
2337 | // Either op1 is a float or dbl constant or an addr |
2338 | if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) |
2339 | { |
2340 | MakeSrcContained(simdNode, op1); |
2341 | } |
2342 | } |
2343 | } |
2344 | break; |
2345 | |
2346 | case SIMDIntrinsicInitArray: |
2347 | // We have an array and an index, which may be contained. |
2348 | CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2()); |
2349 | break; |
2350 | |
2351 | case SIMDIntrinsicOpEquality: |
2352 | case SIMDIntrinsicOpInEquality: |
            // On SSE4/AVX, we can generate optimal code for (in)equality
            // against zero using ptest. We can safely do this optimization
            // for integral vectors but not for floating-point because
            // floating-point has both +0.0 and -0.0, which compare equal
            // but differ in bit pattern.
2357 | op2 = simdNode->gtGetOp2(); |
2358 | if ((comp->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0)) |
2359 | { |
2360 | MakeSrcContained(simdNode, op2); |
2361 | } |
2362 | break; |
2363 | |
2364 | case SIMDIntrinsicGetItem: |
2365 | { |
            // This implements the get_Item method. The sources are:
2367 | // - the source SIMD struct |
2368 | // - index (which element to get) |
2369 | // The result is baseType of SIMD struct. |
2370 | op1 = simdNode->gtOp.gtOp1; |
2371 | op2 = simdNode->gtOp.gtOp2; |
2372 | |
2373 | if (op1->OperGet() == GT_IND) |
2374 | { |
2375 | assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0); |
2376 | op1->AsIndir()->Addr()->ClearContained(); |
2377 | } |
2378 | // If the index is a constant, mark it as contained. |
2379 | CheckImmedAndMakeContained(simdNode, op2); |
2380 | |
2381 | if (IsContainableMemoryOp(op1)) |
2382 | { |
2383 | MakeSrcContained(simdNode, op1); |
2384 | if (op1->OperGet() == GT_IND) |
2385 | { |
2386 | op1->AsIndir()->Addr()->ClearContained(); |
2387 | } |
2388 | } |
2389 | } |
2390 | break; |
2391 | |
2392 | case SIMDIntrinsicShuffleSSE2: |
2393 | // Second operand is an integer constant and marked as contained. |
2394 | assert(simdNode->gtOp.gtOp2->IsCnsIntOrI()); |
2395 | MakeSrcContained(simdNode, simdNode->gtOp.gtOp2); |
2396 | break; |
2397 | |
2398 | default: |
2399 | break; |
2400 | } |
2401 | } |
2402 | #endif // FEATURE_SIMD |
2403 | |
2404 | #ifdef FEATURE_HW_INTRINSICS |
2405 | //---------------------------------------------------------------------------------------------- |
2406 | // IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op. |
2407 | // |
2408 | // Arguments: |
2409 | // containingNode - The hardware intrinsic node which contains 'node' |
2410 | // node - The node to check |
2411 | // [Out] supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands; |
2412 | // otherwise, false. |
2413 | // |
2414 | // Return Value: |
2415 | // true if 'node' is a containable hardware intrinsic node; otherwise, false. |
2416 | // |
2417 | bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node, bool* supportsRegOptional) |
2418 | { |
2419 | NamedIntrinsic containingIntrinsicId = containingNode->gtHWIntrinsicId; |
2420 | HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(containingIntrinsicId); |
2421 | |
2422 | // We shouldn't have called in here if containingNode doesn't support containment |
2423 | assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId)); |
2424 | |
2425 | // containingNode supports nodes that read from an aligned memory address |
2426 | // |
2427 | // This will generally be an explicit LoadAligned instruction and is generally |
2428 | // false for machines with VEX support. This is because there is currently no way |
2429 | // to guarantee that the address read from will always be aligned and we could silently |
2430 | // change the behavior of the program in the case where an Access Violation would have |
2431 | // otherwise occurred. |
2432 | bool supportsAlignedSIMDLoads = false; |
2433 | |
2434 | // containingNode supports nodes that read from general memory |
2435 | // |
2436 | // We currently have to assume all "general" loads are unaligned. As such, this is |
2437 | // generally used to determine if we can mark the node as `regOptional` in the case |
2438 | // where `node` is not containable. However, this can also be used to determine whether |
2439 | // we can mark other types of reads as contained (such as when directly reading a local). |
2440 | bool supportsGeneralLoads = false; |
2441 | |
2442 | // containingNode supports nodes that read from a scalar memory address |
2443 | // |
2444 | // This will generally be an explicit LoadScalar instruction but is also used to determine |
2445 | // whether we can read an address of type T (we don't support this when the load would |
2446 | // read more than sizeof(T) bytes). |
2447 | bool supportsSIMDScalarLoads = false; |
2448 | |
2449 | // containingNode supports nodes that read from an unaligned memory address |
2450 | // |
2451 | // This will generally be an explicit Load instruction and is generally false for machines |
2452 | // without VEX support. This is because older hardware required that the SIMD operand always |
2453 | // be aligned to the 'natural alignment' of the type. |
2454 | bool supportsUnalignedSIMDLoads = false; |
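    // For example, a legacy-encoded "addps xmm0, [mem]" faults unless [mem] is
    // 16-byte aligned, whereas the VEX-encoded "vaddps" has no such restriction.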
2455 | |
2456 | switch (category) |
2457 | { |
2458 | case HW_Category_SimpleSIMD: |
2459 | { |
2460 | // These intrinsics only expect 16 or 32-byte nodes for containment |
2461 | assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); |
2462 | assert(supportsSIMDScalarLoads == false); |
2463 | |
2464 | supportsAlignedSIMDLoads = |
2465 | !comp->canUseVexEncoding() && (containingIntrinsicId != NI_SSE2_ConvertToVector128Double); |
2466 | supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads; |
2467 | supportsGeneralLoads = supportsUnalignedSIMDLoads; |
2468 | |
2469 | break; |
2470 | } |
2471 | |
2472 | case HW_Category_IMM: |
2473 | { |
2474 | switch (containingIntrinsicId) |
2475 | { |
2476 | case NI_SSE_Shuffle: |
2477 | case NI_SSE2_CompareLessThan: |
2478 | case NI_SSE2_ShiftLeftLogical: |
2479 | case NI_SSE2_ShiftRightArithmetic: |
2480 | case NI_SSE2_ShiftRightLogical: |
2481 | case NI_SSE2_Shuffle: |
2482 | case NI_SSE2_ShuffleHigh: |
2483 | case NI_SSE2_ShuffleLow: |
2484 | case NI_SSSE3_AlignRight: |
2485 | case NI_SSE41_Blend: |
2486 | case NI_SSE41_DotProduct: |
2487 | case NI_SSE41_MultipleSumAbsoluteDifferences: |
2488 | case NI_AES_KeygenAssist: |
2489 | case NI_PCLMULQDQ_CarrylessMultiply: |
2490 | case NI_AVX_Blend: |
2491 | case NI_AVX_Compare: |
2492 | case NI_AVX_DotProduct: |
2493 | case NI_AVX_InsertVector128: |
2494 | case NI_AVX_Permute: |
2495 | case NI_AVX_Permute2x128: |
2496 | case NI_AVX2_Blend: |
2497 | case NI_AVX2_InsertVector128: |
2498 | case NI_AVX2_MultipleSumAbsoluteDifferences: |
2499 | case NI_AVX2_Permute2x128: |
2500 | case NI_AVX2_Permute4x64: |
2501 | case NI_AVX2_ShiftLeftLogical: |
2502 | case NI_AVX2_ShiftRightArithmetic: |
2503 | case NI_AVX2_ShiftRightLogical: |
2504 | case NI_AVX2_ShuffleHigh: |
2505 | case NI_AVX2_ShuffleLow: |
2506 | { |
2507 | // These intrinsics only expect 16 or 32-byte nodes for containment |
2508 | assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); |
2509 | assert(supportsSIMDScalarLoads == false); |
2510 | |
2511 | supportsAlignedSIMDLoads = !comp->canUseVexEncoding(); |
2512 | supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads; |
2513 | supportsGeneralLoads = supportsUnalignedSIMDLoads; |
2514 | |
2515 | break; |
2516 | } |
2517 | |
2518 | case NI_SSE2_Insert: |
2519 | case NI_SSE41_Insert: |
2520 | case NI_SSE41_X64_Insert: |
2521 | { |
2522 | if (containingNode->gtSIMDBaseType == TYP_FLOAT) |
2523 | { |
2524 | assert(containingIntrinsicId == NI_SSE41_Insert); |
2525 | assert(genTypeSize(node->TypeGet()) == 16); |
2526 | |
2527 | // Sse41.Insert(V128<float>, V128<float>, byte) is a bit special |
2528 | // in that it has different behavior depending on whether the |
2529 | // second operand is coming from a register or memory. When coming |
2530 | // from a register, all 4 elements of the vector can be used and it |
2531 | // is effectively a regular `SimpleSIMD` operation; but when loading |
2532 | // from memory, it only works with the lowest element and is effectively |
2533 | // a `SIMDScalar`. |
2534 | |
2535 | assert(supportsAlignedSIMDLoads == false); |
2536 | assert(supportsUnalignedSIMDLoads == false); |
2537 | assert(supportsGeneralLoads == false); |
2538 | assert(supportsSIMDScalarLoads == false); |
2539 | |
2540 | GenTree* op1 = containingNode->gtGetOp1(); |
2541 | GenTree* op2 = nullptr; |
2542 | GenTree* op3 = nullptr; |
2543 | |
2544 | assert(op1->OperIsList()); |
2545 | assert(containingNode->gtGetOp2() == nullptr); |
2546 | |
2547 | GenTreeArgList* argList = op1->AsArgList(); |
2548 | |
2549 | op1 = argList->Current(); |
2550 | argList = argList->Rest(); |
2551 | |
2552 | op2 = argList->Current(); |
2553 | argList = argList->Rest(); |
2554 | |
2555 | assert(node == op2); |
2556 | |
2557 | op3 = argList->Current(); |
2558 | |
2559 | // The upper two bits of the immediate value are ignored if |
2560 | // op2 comes from memory. In order to support using the upper |
2561 | // bits, we need to disable containment support if op3 is not |
2562 | // constant or if the constant is greater than 0x3F (which means |
2563 | // at least one of the upper two bits is set). |
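                        // (e.g. an immediate with bit 6 or 7 set selects a nonzero
                        // source element of op2, which is only possible when op2 is
                        // in a register).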
2564 | |
2565 | if (op3->IsCnsIntOrI()) |
2566 | { |
2567 | ssize_t ival = op3->AsIntCon()->IconValue(); |
2568 | assert((ival >= 0) && (ival <= 255)); |
2569 | |
2570 | supportsSIMDScalarLoads = (ival <= 0x3F); |
2571 | supportsGeneralLoads = supportsSIMDScalarLoads; |
2572 | } |
2573 | break; |
2574 | } |
2575 | |
2576 | // We should only get here for integral nodes. |
2577 | assert(varTypeIsIntegral(node->TypeGet())); |
2578 | |
2579 | assert(supportsAlignedSIMDLoads == false); |
2580 | assert(supportsUnalignedSIMDLoads == false); |
2581 | assert(supportsSIMDScalarLoads == false); |
2582 | |
2583 | const unsigned expectedSize = genTypeSize(containingNode->gtSIMDBaseType); |
2584 | const unsigned operandSize = genTypeSize(node->TypeGet()); |
2585 | |
2586 | supportsGeneralLoads = (operandSize >= expectedSize); |
2587 | break; |
2588 | } |
2589 | |
2590 | case NI_AVX_CompareScalar: |
2591 | { |
2592 | // These intrinsics only expect 16 or 32-byte nodes for containment |
2593 | assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); |
2594 | |
2595 | assert(supportsAlignedSIMDLoads == false); |
2596 | assert(supportsUnalignedSIMDLoads == false); |
2597 | |
2598 | supportsSIMDScalarLoads = true; |
2599 | supportsGeneralLoads = supportsSIMDScalarLoads; |
2600 | break; |
2601 | } |
2602 | |
2603 | default: |
2604 | { |
2605 | assert(supportsAlignedSIMDLoads == false); |
2606 | assert(supportsGeneralLoads == false); |
2607 | assert(supportsSIMDScalarLoads == false); |
2608 | assert(supportsUnalignedSIMDLoads == false); |
2609 | break; |
2610 | } |
2611 | } |
2612 | break; |
2613 | } |
2614 | |
2615 | case HW_Category_SIMDScalar: |
2616 | { |
2617 | assert(supportsAlignedSIMDLoads == false); |
2618 | assert(supportsUnalignedSIMDLoads == false); |
2619 | |
2620 | switch (containingIntrinsicId) |
2621 | { |
2622 | case NI_Base_Vector128_CreateScalarUnsafe: |
2623 | case NI_Base_Vector256_CreateScalarUnsafe: |
2624 | { |
2625 | assert(supportsSIMDScalarLoads == false); |
2626 | |
2627 | const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); |
2628 | const unsigned operandSize = genTypeSize(node->TypeGet()); |
2629 | |
2630 | supportsGeneralLoads = (operandSize == expectedSize); |
2631 | break; |
2632 | } |
2633 | |
2634 | case NI_SSE_ConvertScalarToVector128Single: |
2635 | case NI_SSE2_ConvertScalarToVector128Double: |
2636 | case NI_SSE2_ConvertScalarToVector128Int32: |
2637 | case NI_SSE2_ConvertScalarToVector128UInt32: |
2638 | case NI_SSE_X64_ConvertScalarToVector128Single: |
2639 | case NI_SSE2_X64_ConvertScalarToVector128Double: |
2640 | case NI_SSE2_X64_ConvertScalarToVector128Int64: |
2641 | case NI_SSE2_X64_ConvertScalarToVector128UInt64: |
2642 | { |
2643 | if (!varTypeIsIntegral(node->TypeGet())) |
2644 | { |
2645 | // The floating-point overload doesn't require any special semantics |
2646 | assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double); |
2647 | supportsSIMDScalarLoads = true; |
2648 | supportsGeneralLoads = supportsSIMDScalarLoads; |
2649 | break; |
2650 | } |
2651 | |
2652 | assert(supportsSIMDScalarLoads == false); |
2653 | |
2654 | const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); |
2655 | const unsigned operandSize = genTypeSize(node->TypeGet()); |
2656 | |
2657 | supportsGeneralLoads = (operandSize == expectedSize); |
2658 | break; |
2659 | } |
2660 | |
2661 | default: |
2662 | { |
2663 | // These intrinsics only expect 16 or 32-byte nodes for containment |
2664 | assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); |
2665 | |
2666 | supportsSIMDScalarLoads = true; |
2667 | supportsGeneralLoads = supportsSIMDScalarLoads; |
2668 | break; |
2669 | } |
2670 | } |
2671 | break; |
2672 | } |
2673 | |
2674 | case HW_Category_Scalar: |
2675 | { |
2676 | // We should only get here for integral nodes. |
2677 | assert(varTypeIsIntegral(node->TypeGet())); |
2678 | |
2679 | assert(supportsAlignedSIMDLoads == false); |
2680 | assert(supportsUnalignedSIMDLoads == false); |
2681 | assert(supportsSIMDScalarLoads == false); |
2682 | |
2683 | const unsigned expectedSize = genTypeSize(containingNode->TypeGet()); |
2684 | const unsigned operandSize = genTypeSize(node->TypeGet()); |
2685 | |
2686 | supportsGeneralLoads = (operandSize >= expectedSize); |
2687 | break; |
2688 | } |
2689 | |
2690 | default: |
2691 | { |
2692 | assert(supportsAlignedSIMDLoads == false); |
2693 | assert(supportsGeneralLoads == false); |
2694 | assert(supportsSIMDScalarLoads == false); |
2695 | assert(supportsUnalignedSIMDLoads == false); |
2696 | break; |
2697 | } |
2698 | } |
2699 | |
2700 | noway_assert(supportsRegOptional != nullptr); |
2701 | *supportsRegOptional = supportsGeneralLoads; |
2702 | |
2703 | if (!node->OperIsHWIntrinsic()) |
2704 | { |
2705 | return supportsGeneralLoads && IsContainableMemoryOp(node); |
2706 | } |
2707 | |
2708 | // TODO-XArch: Update this to be table driven, if possible. |
2709 | |
2710 | NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->gtHWIntrinsicId; |
2711 | |
2712 | switch (intrinsicId) |
2713 | { |
2714 | case NI_SSE_LoadAlignedVector128: |
2715 | case NI_SSE2_LoadAlignedVector128: |
2716 | case NI_AVX_LoadAlignedVector256: |
2717 | { |
2718 | return supportsAlignedSIMDLoads; |
2719 | } |
2720 | |
2721 | case NI_SSE_LoadScalarVector128: |
2722 | case NI_SSE2_LoadScalarVector128: |
2723 | { |
2724 | return supportsSIMDScalarLoads; |
2725 | } |
2726 | |
2727 | // VEX encoding supports unaligned memory ops, so we can fold them |
2728 | case NI_SSE_LoadVector128: |
2729 | case NI_SSE2_LoadVector128: |
2730 | case NI_AVX_LoadVector256: |
2731 | { |
2732 | return supportsUnalignedSIMDLoads; |
2733 | } |
2734 | |
2735 | default: |
2736 | { |
2737 | assert(!node->isContainableHWIntrinsic()); |
2738 | return false; |
2739 | } |
2740 | } |
2741 | } |
2742 | |
2743 | //---------------------------------------------------------------------------------------------- |
2744 | // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node. |
2745 | // |
2746 | // Arguments: |
2747 | // node - The hardware intrinsic node. |
2748 | // |
2749 | void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) |
2750 | { |
2751 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
2752 | HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); |
2753 | int numArgs = HWIntrinsicInfo::lookupNumArgs(node); |
2754 | var_types baseType = node->gtSIMDBaseType; |
2755 | |
2756 | GenTree* op1 = node->gtGetOp1(); |
2757 | GenTree* op2 = node->gtGetOp2(); |
2758 | GenTree* op3 = nullptr; |
2759 | |
2760 | if (!HWIntrinsicInfo::SupportsContainment(intrinsicId)) |
2761 | { |
        // AVX2 gather intrinsics are not containable and always have a constant IMM argument
2763 | if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId)) |
2764 | { |
2765 | GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); |
2766 | assert(lastOp != nullptr); |
2767 | MakeSrcContained(node, lastOp); |
2768 | } |
2769 | // Exit early if containment isn't supported |
2770 | return; |
2771 | } |
2772 | |
2773 | // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained |
2774 | |
2775 | const bool isCommutative = HWIntrinsicInfo::IsCommutative(intrinsicId); |
2776 | |
2777 | if (numArgs == 1) |
2778 | { |
        // One-argument intrinsics cannot be commutative
2780 | assert(!isCommutative); |
2781 | |
2782 | assert(!op1->OperIsList()); |
2783 | assert(op2 == nullptr); |
2784 | |
2785 | switch (category) |
2786 | { |
2787 | case HW_Category_SimpleSIMD: |
2788 | case HW_Category_SIMDScalar: |
2789 | case HW_Category_Scalar: |
2790 | { |
2791 | switch (intrinsicId) |
2792 | { |
2793 | case NI_SSE_ReciprocalScalar: |
2794 | case NI_SSE_ReciprocalSqrtScalar: |
2795 | case NI_SSE_SqrtScalar: |
2796 | case NI_SSE2_SqrtScalar: |
2797 | case NI_SSE41_CeilingScalar: |
2798 | case NI_SSE41_FloorScalar: |
2799 | case NI_SSE41_RoundCurrentDirectionScalar: |
2800 | case NI_SSE41_RoundToNearestIntegerScalar: |
2801 | case NI_SSE41_RoundToNegativeInfinityScalar: |
2802 | case NI_SSE41_RoundToPositiveInfinityScalar: |
2803 | case NI_SSE41_RoundToZeroScalar: |
2804 | { |
2805 | // These intrinsics have both 1 and 2-operand overloads. |
2806 | // |
2807 | // The 1-operand overload basically does `intrinsic(op1, op1)` |
2808 | // |
2809 | // Because of this, the operand must be loaded into a register |
2810 | // and cannot be contained. |
2811 | return; |
2812 | } |
2813 | |
2814 | case NI_SSE2_ConvertToInt32: |
2815 | case NI_SSE2_X64_ConvertToInt64: |
2816 | case NI_SSE2_ConvertToUInt32: |
2817 | case NI_SSE2_X64_ConvertToUInt64: |
2818 | case NI_AVX2_ConvertToInt32: |
2819 | case NI_AVX2_ConvertToUInt32: |
2820 | { |
2821 | if (varTypeIsIntegral(baseType)) |
2822 | { |
2823 | // These intrinsics are "ins reg/mem, xmm" and don't |
2824 | // currently support containment. |
2825 | return; |
2826 | } |
2827 | |
2828 | break; |
2829 | } |
2830 | |
2831 | default: |
2832 | { |
2833 | break; |
2834 | } |
2835 | } |
2836 | |
2837 | bool supportsRegOptional = false; |
2838 | |
2839 | if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) |
2840 | { |
2841 | MakeSrcContained(node, op1); |
2842 | } |
2843 | else if (supportsRegOptional) |
2844 | { |
2845 | op1->SetRegOptional(); |
2846 | } |
2847 | break; |
2848 | } |
2849 | |
2850 | default: |
2851 | { |
2852 | unreached(); |
2853 | break; |
2854 | } |
2855 | } |
2856 | } |
2857 | else |
2858 | { |
2859 | if (numArgs == 2) |
2860 | { |
2861 | assert(!op1->OperIsList()); |
2862 | assert(op2 != nullptr); |
2863 | assert(!op2->OperIsList()); |
2864 | |
2865 | switch (category) |
2866 | { |
2867 | case HW_Category_SimpleSIMD: |
2868 | case HW_Category_SIMDScalar: |
2869 | case HW_Category_Scalar: |
2870 | { |
2871 | if (HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId)) |
2872 | { |
2873 | switch (intrinsicId) |
2874 | { |
2875 | case NI_SSE_CompareLessThanOrderedScalar: |
2876 | case NI_SSE_CompareLessThanUnorderedScalar: |
2877 | case NI_SSE_CompareLessThanOrEqualOrderedScalar: |
2878 | case NI_SSE_CompareLessThanOrEqualUnorderedScalar: |
2879 | case NI_SSE2_CompareLessThanOrderedScalar: |
2880 | case NI_SSE2_CompareLessThanUnorderedScalar: |
2881 | case NI_SSE2_CompareLessThanOrEqualOrderedScalar: |
2882 | case NI_SSE2_CompareLessThanOrEqualUnorderedScalar: |
2883 | { |
                                // We need to swap the operands for the CompareLessThan* scalar intrinsics
2885 | node->gtOp1 = op2; |
2886 | node->gtOp2 = op1; |
2887 | op2 = op1; |
2888 | break; |
2889 | } |
2890 | |
2891 | default: |
2892 | { |
                                // TODO-XArch-CQ: The Compare*OrderedScalar and Compare*UnorderedScalar methods
                                // are commutative if you also invert the intrinsic.
2895 | break; |
2896 | } |
2897 | } |
2898 | } |
2899 | |
2900 | bool supportsRegOptional = false; |
2901 | |
2902 | if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) |
2903 | { |
2904 | MakeSrcContained(node, op2); |
2905 | } |
2906 | else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || |
2907 | (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && |
2908 | IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) |
2909 | { |
2910 | MakeSrcContained(node, op1); |
2911 | |
2912 | // Swap the operands here to make the containment checks in codegen significantly simpler |
2913 | node->gtOp1 = op2; |
2914 | node->gtOp2 = op1; |
2915 | } |
2916 | else if (supportsRegOptional) |
2917 | { |
2918 | op2->SetRegOptional(); |
2919 | |
2920 | // TODO-XArch-CQ: For commutative nodes, either operand can be reg-optional. |
2921 | // https://github.com/dotnet/coreclr/issues/6361 |
2922 | } |
2923 | break; |
2924 | } |
2925 | |
2926 | case HW_Category_IMM: |
2927 | { |
2928 | // We don't currently have any IMM intrinsics which are also commutative |
2929 | assert(!isCommutative); |
2930 | bool supportsRegOptional = false; |
2931 | |
2932 | switch (intrinsicId) |
2933 | { |
2934 | case NI_SSE2_ShiftLeftLogical: |
2935 | case NI_SSE2_ShiftRightArithmetic: |
2936 | case NI_SSE2_ShiftRightLogical: |
2937 | case NI_AVX2_ShiftLeftLogical: |
2938 | case NI_AVX2_ShiftRightArithmetic: |
2939 | case NI_AVX2_ShiftRightLogical: |
2940 | { |
2941 | // These intrinsics can have op2 be imm or reg/mem |
2942 | |
2943 | if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2)) |
2944 | { |
2945 | if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) |
2946 | { |
2947 | MakeSrcContained(node, op2); |
2948 | } |
2949 | else if (supportsRegOptional) |
2950 | { |
2951 | op2->SetRegOptional(); |
2952 | } |
2953 | } |
2954 | break; |
2955 | } |

                        case NI_SSE2_Shuffle:
                        case NI_SSE2_ShuffleHigh:
                        case NI_SSE2_ShuffleLow:
                        case NI_AVX2_Permute4x64:
                        {
                            // These intrinsics take op2 as an immediate and allow op1 to be reg/mem.

                            if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                            {
                                MakeSrcContained(node, op1);
                            }
                            else if (supportsRegOptional)
                            {
                                op1->SetRegOptional();
                            }
                            break;
                        }

                        case NI_AVX_Permute:
                        {
                            // For Permute, op2 can be either an immediate or reg/mem; when op2 is an
                            // immediate, op1 can be contained as reg/mem instead.

                            if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                            {
                                if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                                {
                                    MakeSrcContained(node, op1);
                                }
                                else if (supportsRegOptional)
                                {
                                    op1->SetRegOptional();
                                }
                            }
                            else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                            {
                                MakeSrcContained(node, op2);
                            }
                            else if (supportsRegOptional)
                            {
                                op2->SetRegOptional();
                            }
                            break;
                        }

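                        // For KeygenAssist, op2 is the immediate control byte (contained, when constant,
                        // by the generic IMM handling at the end of this function), so only op1 is a
                        // candidate for memory containment here.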
                        case NI_AES_KeygenAssist:
                        {
                            if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                            {
                                MakeSrcContained(node, op1);
                            }
                            else if (supportsRegOptional)
                            {
                                op1->SetRegOptional();
                            }
                            break;
                        }

                        default:
                        {
                            break;
                        }
                    }

                    break;
                }

                case HW_Category_Special:
                {
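                    // NI_SSE2_CompareLessThan is the only two-operand Special-category intrinsic
                    // expected here; its integer forms have no direct SSE2 encoding and require
                    // special expansion in codegen, hence the category.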
                    if (intrinsicId == NI_SSE2_CompareLessThan)
                    {
                        bool supportsRegOptional = false;

                        if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                        {
                            MakeSrcContained(node, op2);
                        }
                        else if (supportsRegOptional)
                        {
                            op2->SetRegOptional();
                        }
                    }
                    else
                    {
                        unreached();
                    }
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }
        else if (numArgs == 3)
        {
            // Three-argument intrinsics should not be marked commutative
            assert(!isCommutative);

            assert(op1->OperIsList());
            assert(op2 == nullptr);

            GenTreeArgList* argList         = op1->AsArgList();
            GenTreeArgList* originalArgList = argList;

            op1     = argList->Current();
            argList = argList->Rest();

            op2     = argList->Current();
            argList = argList->Rest();

            op3 = argList->Current();
            assert(argList->Rest() == nullptr);

            switch (category)
            {
                case HW_Category_SimpleSIMD:
                case HW_Category_SIMDScalar:
                case HW_Category_Scalar:
                {
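                    // FMA3 instructions come in 132, 213, and 231 forms that differ in which source
                    // operand may come from memory; containing a different operand lets codegen
                    // select the matching form (see the per-form comments below).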
                    if ((intrinsicId >= NI_FMA_MultiplyAdd) && (intrinsicId <= NI_FMA_MultiplySubtractNegatedScalar))
                    {
                        bool supportsRegOptional = false;

                        if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional))
                        {
                            // 213 form: op1 = (op2 * op1) + [op3]
                            MakeSrcContained(node, op3);
                        }
                        else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                        {
                            // 132 form: op1 = (op1 * op3) + [op2]
                            MakeSrcContained(node, op2);
                        }
                        else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                        {
                            // Intrinsics with CopyUpperBits semantics cannot have op1 contained

                            if (!HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                            {
                                // 231 form: op3 = (op2 * op3) + [op1]
                                MakeSrcContained(node, op1);
                            }
                        }
                        else
                        {
                            assert(supportsRegOptional);

                            // TODO-XArch-CQ: Technically any one of the three operands can be reg-optional,
                            // with the limitation that op1 can only be reg-optional when CopyUpperBits
                            // is off. https://github.com/dotnet/coreclr/issues/6361

                            // 213 form: op1 = (op2 * op1) + op3
                            op3->SetRegOptional();
                        }
                    }
                    else
                    {
                        bool supportsRegOptional = false;

                        switch (intrinsicId)
                        {
                            case NI_SSE41_BlendVariable:
                            case NI_AVX_BlendVariable:
                            case NI_AVX2_BlendVariable:
                            {
                                if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                                {
                                    MakeSrcContained(node, op2);
                                }
                                else if (supportsRegOptional)
                                {
                                    op2->SetRegOptional();
                                }
                                break;
                            }

                            case NI_BMI2_MultiplyNoFlags:
                            case NI_BMI2_X64_MultiplyNoFlags:
                            {
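                                // For MultiplyNoFlags(left, right, low), op1 and op2 are the
                                // multiplicands and op3 is the address receiving the low half of the
                                // product; only the multiplicands are containment candidates here.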
                                if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                                {
                                    MakeSrcContained(node, op2);
                                }
                                else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                                {
                                    MakeSrcContained(node, op1);
                                    // MultiplyNoFlags is a commutative operation, so swap the first two operands
                                    // here to make the containment checks in codegen significantly simpler
                                    *(originalArgList->pCurrent())         = op2;
                                    *(originalArgList->Rest()->pCurrent()) = op1;
                                }
                                else if (supportsRegOptional)
                                {
                                    op2->SetRegOptional();
                                }
                                break;
                            }

                            default:
                            {
                                unreached();
                                break;
                            }
                        }
                    }

                    break;
                }

                case HW_Category_IMM:
                {
                    bool supportsRegOptional = false;

                    switch (intrinsicId)
                    {
                        case NI_SSE_Shuffle:
                        case NI_SSE2_Insert:
                        case NI_SSE2_Shuffle:
                        case NI_SSSE3_AlignRight:
                        case NI_SSE41_Blend:
                        case NI_SSE41_DotProduct:
                        case NI_SSE41_Insert:
                        case NI_SSE41_X64_Insert:
                        case NI_SSE41_MultipleSumAbsoluteDifferences:
                        case NI_AVX_Blend:
                        case NI_AVX_Compare:
                        case NI_AVX_CompareScalar:
                        case NI_AVX_DotProduct:
                        case NI_AVX_Permute2x128:
                        case NI_AVX_Shuffle:
                        case NI_AVX2_Blend:
                        case NI_AVX2_MultipleSumAbsoluteDifferences:
                        case NI_AVX2_Permute2x128:
                        case NI_PCLMULQDQ_CarrylessMultiply:
                        {
                            if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                            {
                                MakeSrcContained(node, op2);
                            }
                            else if (supportsRegOptional)
                            {
                                op2->SetRegOptional();
                            }
                            break;
                        }

                        default:
                        {
                            break;
                        }
                    }

                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }
        else
        {
            unreached();
        }

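        // For HW_Category_IMM intrinsics, contain the trailing immediate operand when it is a
        // constant so that no register is consumed for it; only constant control bytes can be
        // baked into the instruction encoding, and non-constant values are handled by a
        // fallback in codegen.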
        if (HWIntrinsicInfo::lookupCategory(intrinsicId) == HW_Category_IMM)
        {
            GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
            assert(lastOp != nullptr);

            if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI())
            {
                MakeSrcContained(node, lastOp);
            }
        }
    }
}
#endif // FEATURE_HW_INTRINSICS

//------------------------------------------------------------------------
// ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckFloatBinary(GenTreeOp* node)
{
    assert(node->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV) && varTypeIsFloating(node));

    // overflow operations aren't supported on float/double types.
    assert(!node->gtOverflowEx());

    GenTree* op1 = node->gtGetOp1();
    GenTree* op2 = node->gtGetOp2();

    // No implicit conversions at this stage as the expectation is that
    // everything is made explicit by adding casts.
    assert(op1->TypeGet() == op2->TypeGet());

    bool isSafeToContainOp1 = true;
    bool isSafeToContainOp2 = true;

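    // A non-zero floating-point constant is materialized from the data section, so it can be
    // contained as a memory operand; zero is excluded because it is cheaper to generate via
    // xorps/xorpd.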
    if (op2->IsCnsNonZeroFltOrDbl())
    {
        MakeSrcContained(node, op2);
    }
    else if (IsContainableMemoryOp(op2))
    {
        isSafeToContainOp2 = IsSafeToContainMem(node, op2);
        if (isSafeToContainOp2)
        {
            MakeSrcContained(node, op2);
        }
    }

    if (!op2->isContained() && node->OperIsCommutative())
    {
        // Although we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
        // as long as it is safe so that the following efficient code sequence is generated:
        //      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
        //      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
        //
        // Instead of
        //      movss op1Reg, [memOp]; addss/sd targetReg, op2Reg  (if op1Reg == targetReg) OR
        //      movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, op2Reg

        if (op1->IsCnsNonZeroFltOrDbl())
        {
            MakeSrcContained(node, op1);
        }
        else if (IsContainableMemoryOp(op1))
        {
            isSafeToContainOp1 = IsSafeToContainMem(node, op1);
            if (isSafeToContainOp1)
            {
                MakeSrcContained(node, op1);
            }
        }
    }

    if (!op1->isContained() && !op2->isContained())
    {
        // If there are no containable operands, we can make an operand reg optional.
        // IsSafeToContainMem is expensive so we call it at most once for each operand
        // in this method. If we already called IsSafeToContainMem, it must have returned false;
        // otherwise, the corresponding operand (op1 or op2) would be contained.
        isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
        isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
        SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
    }
}

#endif // _TARGET_XARCH_