1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
5/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7XX XX
8XX Amd64/x86 Code Generator XX
9XX XX
10XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12*/
13#include "jitpch.h"
14#ifdef _MSC_VER
15#pragma hdrstop
16#endif
17
18#ifdef _TARGET_XARCH_
19#include "emit.h"
20#include "codegen.h"
21#include "lower.h"
22#include "gcinfo.h"
23#include "gcinfoencoder.h"
24
25/*****************************************************************************
26 *
27 * Generate code that will set the given register to the integer constant.
28 */
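// A minimal sketch of what this produces (exact encoding depends on 'type' and 'flags'):
//   xor reg, reg   ; when val == 0
//   mov reg, val   ; otherwise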
29
30void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags)
31{
32 // Reg cannot be a FP reg
33 assert(!genIsValidFloatReg(reg));
34
    // The only TYP_REF constant that can come down this path is a managed 'null', since it
    // is not relocatable. Other ref type constants (e.g. string objects) go through a
    // different code path.
38 noway_assert(type != TYP_REF || val == 0);
39
40 if (val == 0)
41 {
42 instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags);
43 }
44 else
45 {
46 // TODO-XArch-CQ: needs all the optimized cases
47 getEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val);
48 }
49}
50
51/*****************************************************************************
52 *
53 * Generate code to check that the GS cookie wasn't thrashed by a buffer
 * overrun. If pushReg is true, preserve all registers around the code sequence.
 * Otherwise ECX could be modified.
 *
 * Implementation Note: pushReg is true in the case of tail calls.
58 */
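// Illustrative shape of the emitted check (a sketch; the addressing mode, register and
// immediate vary with the cases handled below):
//   cmp  [GS cookie frame slot], imm32/reg
//   je   ok
//   call CORINFO_HELP_FAIL_FAST
// ok: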
59void CodeGen::genEmitGSCookieCheck(bool pushReg)
60{
61 noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal);
62
    // Make sure that EAX is reported as a live GC-ref, so that a GC that kicks in while
    // executing the GS cookie check will not collect the object pointed to by EAX.
    //
    // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX.
    // In such a case, make sure that the correct GC-ness of RDX is reported as well, so
    // that a GC object pointed to by RDX will not be collected.
69 if (!pushReg)
70 {
71 // Handle multi-reg return type values
72 if (compiler->compMethodReturnsMultiRegRetType())
73 {
74 ReturnTypeDesc retTypeDesc;
75 if (varTypeIsLong(compiler->info.compRetNativeType))
76 {
77 retTypeDesc.InitializeLongReturnType(compiler);
78 }
79 else // we must have a struct return type
80 {
81 retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass);
82 }
83
84 unsigned regCount = retTypeDesc.GetReturnRegCount();
85
            // Only the x86 and x64 Unix ABIs allow multi-reg returns, and the number of
            // result regs should equal MAX_RET_REG_COUNT.
88 assert(regCount == MAX_RET_REG_COUNT);
89
90 for (unsigned i = 0; i < regCount; ++i)
91 {
92 gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i));
93 }
94 }
95 else if (compiler->compMethodReturnsRetBufAddr())
96 {
            // This is for returning via an implicit RetBuf.
            // If the address of the buffer is returned in REG_INTRET, mark the content of
            // REG_INTRET as a byref.

            // Since the return is via an implicit RetBuf, the native return type should be a struct.
101 assert(varTypeIsStruct(compiler->info.compRetNativeType));
102
103 gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF);
104 }
105 // ... all other cases.
106 else
107 {
108#ifdef _TARGET_AMD64_
            // For x64, structs that are not returned in registers are always
            // returned via an implicit RetBuf. If we reached here, we should not have
            // a RetBuf and the return type should not be a struct.
112 assert(compiler->info.compRetBuffArg == BAD_VAR_NUM);
113 assert(!varTypeIsStruct(compiler->info.compRetNativeType));
114#endif // _TARGET_AMD64_
115
            // For x86 Windows we can't make such assertions, since we generate code that returns
            // the RetBuf in REG_INTRET only when the ProfilerHook is enabled; otherwise
            // compRetNativeType could be TYP_STRUCT.
119 gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType);
120 }
121 }
122
123 regNumber regGSCheck;
124 regMaskTP regMaskGSCheck = RBM_NONE;
125
126 if (!pushReg)
127 {
        // Non-tail call: we can use any callee-trash register that is not a return
        // register and does not contain the 'this' pointer (which must be kept alive),
        // since we are generating the GS cookie check after a GT_RETURN block.
        // Note: On Amd64 System V, RDX is an arg register - REG_ARG_2 - as well as a
        // return register for two-register-returned structs.
133 if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister &&
134 (compiler->lvaTable[compiler->info.compThisArg].lvRegNum == REG_ARG_0))
135 {
136 regGSCheck = REG_ARG_1;
137 }
138 else
139 {
140 regGSCheck = REG_ARG_0;
141 }
142 }
143 else
144 {
145#ifdef _TARGET_X86_
146 // It doesn't matter which register we pick, since we're going to save and restore it
147 // around the check.
148 // TODO-CQ: Can we optimize the choice of register to avoid doing the push/pop sometimes?
149 regGSCheck = REG_EAX;
150 regMaskGSCheck = RBM_EAX;
151#else // !_TARGET_X86_
152 // Tail calls from methods that need GS check: We need to preserve registers while
153 // emitting GS cookie check for a tail prefixed call or a jmp. To emit GS cookie
154 // check, we might need a register. This won't be an issue for jmp calls for the
155 // reason mentioned below (see comment starting with "Jmp Calls:").
156 //
157 // The following are the possible solutions in case of tail prefixed calls:
158 // 1) Use R11 - ignore tail prefix on calls that need to pass a param in R11 when
159 // present in methods that require GS cookie check. Rest of the tail calls that
160 // do not require R11 will be honored.
161 // 2) Internal register - GT_CALL node reserves an internal register and emits GS
162 // cookie check as part of tail call codegen. GenExitCode() needs to special case
163 // fast tail calls implemented as epilog+jmp or such tail calls should always get
164 // dispatched via helper.
165 // 3) Materialize GS cookie check as a separate node hanging off GT_CALL node in
166 // right execution order during rationalization.
167 //
168 // There are two calls that use R11: VSD and calli pinvokes with cookie param. Tail
169 // prefix on pinvokes is ignored. That is, options 2 and 3 will allow tail prefixed
170 // VSD calls from methods that need GS check.
171 //
        // Tail prefixed calls: Right now, for Jit64 compatibility, a method requiring a GS cookie
        // check ignores the tail prefix. In the future, if we intend to support tail calls from
        // such methods, consider one of the options mentioned above. For now, assert that we
        // don't expect to see a tail call in a method that requires a GS check.
176 noway_assert(!compiler->compTailCallUsed);
177
        // Jmp calls: these specify a method handle that the JIT uses to query the VM for the
        // entry point address, so they can be neither VSD calls nor PInvoke calli with a cookie
        // parameter. Therefore, for jmp calls it is safe to use R11.
181 regGSCheck = REG_R11;
182#endif // !_TARGET_X86_
183 }
184
185 regMaskTP byrefPushedRegs = RBM_NONE;
186 regMaskTP norefPushedRegs = RBM_NONE;
187 regMaskTP pushedRegs = RBM_NONE;
188
189 if (compiler->gsGlobalSecurityCookieAddr == nullptr)
190 {
191#if defined(_TARGET_AMD64_)
192 // If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'.
193 // Otherwise, load the value into a reg and use 'cmp mem64, reg64'.
194 if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal)
195 {
196 genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL);
197 getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
198 }
199 else
200#endif // defined(_TARGET_AMD64_)
201 {
202 assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal);
203 getEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0,
204 (int)compiler->gsGlobalSecurityCookieVal);
205 }
206 }
207 else
208 {
209 // Ngen case - GS cookie value needs to be accessed through an indirection.
210
211 pushedRegs = genPushRegs(regMaskGSCheck, &byrefPushedRegs, &norefPushedRegs);
212
213 instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr);
214 getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0);
215 getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
216 }
217
218 BasicBlock* gsCheckBlk = genCreateTempLabel();
219 emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
220 inst_JMP(jmpEqual, gsCheckBlk);
221 genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN);
222 genDefineTempLabel(gsCheckBlk);
223
224 genPopRegs(pushedRegs, byrefPushedRegs, norefPushedRegs);
225}
226
227BasicBlock* CodeGen::genCallFinally(BasicBlock* block)
228{
229#if FEATURE_EH_FUNCLETS
230 // Generate a call to the finally, like this:
231 // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym
232 // call finally-funclet
233 // jmp finally-return // Only for non-retless finally calls
234 // The jmp can be a NOP if we're going to the next block.
235 // If we're generating code for the main function (not a funclet), and there is no localloc,
236 // then RSP at this point is the same value as that stored in the PSPSym. So just copy RSP
237 // instead of loading the PSPSym in this case, or if PSPSym is not used (CoreRT ABI).
238
239 if ((compiler->lvaPSPSym == BAD_VAR_NUM) ||
240 (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT)))
241 {
242#ifndef UNIX_X86_ABI
243 inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL);
244#endif // !UNIX_X86_ABI
245 }
246 else
247 {
248 getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0);
249 }
250 getEmitter()->emitIns_J(INS_call, block->bbJumpDest);
251
252 if (block->bbFlags & BBF_RETLESS_CALL)
253 {
254 // We have a retless call, and the last instruction generated was a call.
255 // If the next block is in a different EH region (or is the end of the code
256 // block), then we need to generate a breakpoint here (since it will never
257 // get executed) to get proper unwind behavior.
258
259 if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
260 {
261 instGen(INS_BREAKPOINT); // This should never get executed
262 }
263 }
264 else
265 {
266// TODO-Linux-x86: Do we need to handle the GC information for this NOP or JMP specially, as is done for other
267// architectures?
268#ifndef JIT32_GCENCODER
269 // Because of the way the flowgraph is connected, the liveness info for this one instruction
        // after the call is not (cannot be) correct in cases where a variable has a last use in the
271 // handler. So turn off GC reporting for this single instruction.
272 getEmitter()->emitDisableGC();
273#endif // JIT32_GCENCODER
274
275 // Now go to where the finally funclet needs to return to.
276 if (block->bbNext->bbJumpDest == block->bbNext->bbNext)
277 {
278 // Fall-through.
279 // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly
280 // to the next instruction? This would depend on stack walking from within the finally
281 // handler working without this instruction being in this special EH region.
282 instGen(INS_nop);
283 }
284 else
285 {
286 inst_JMP(EJ_jmp, block->bbNext->bbJumpDest);
287 }
288
289#ifndef JIT32_GCENCODER
290 getEmitter()->emitEnableGC();
291#endif // JIT32_GCENCODER
292 }
293
294#else // !FEATURE_EH_FUNCLETS
295
296 // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot
297 // corresponding to the finally's nesting level. When invoked in response to an exception, the
298 // EE does this.
299 //
300 // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS.
301 //
302 // We will emit :
303 // mov [ebp - (n + 1)], 0
304 // mov [ebp - n ], 0xFC
305 // push &step
306 // jmp finallyBlock
307 // ...
308 // step:
309 // mov [ebp - n ], 0
310 // jmp leaveTarget
311 // ...
312 // leaveTarget:
313
314 noway_assert(isFramePointerUsed());
315
316 // Get the nesting level which contains the finally
317 unsigned finallyNesting = 0;
318 compiler->fgGetNestingLevel(block, &finallyNesting);
319
320 // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
321 unsigned filterEndOffsetSlotOffs;
322 filterEndOffsetSlotOffs = (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
323
324 unsigned curNestingSlotOffs;
325 curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE));
326
327 // Zero out the slot for the next nesting level
328 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar,
329 curNestingSlotOffs - TARGET_POINTER_SIZE);
330 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, LCL_FINALLY_MARK, compiler->lvaShadowSPslotsVar,
331 curNestingSlotOffs);
332
333 // Now push the address where the finally funclet should return to directly.
334 if (!(block->bbFlags & BBF_RETLESS_CALL))
335 {
336 assert(block->isBBCallAlwaysPair());
337 getEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest);
338 }
339 else
340 {
        // The EE expects a DWORD, so we give it 0.
342 inst_IV(INS_push_hide, 0);
343 }
344
345 // Jump to the finally BB
346 inst_JMP(EJ_jmp, block->bbJumpDest);
347
348#endif // !FEATURE_EH_FUNCLETS
349
350 // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the
351 // jump target using bbJumpDest - that is already used to point
352 // to the finally block. So just skip past the BBJ_ALWAYS unless the
353 // block is RETLESS.
354 if (!(block->bbFlags & BBF_RETLESS_CALL))
355 {
356 assert(block->isBBCallAlwaysPair());
357 block = block->bbNext;
358 }
359 return block;
360}
361
362#if FEATURE_EH_FUNCLETS
363void CodeGen::genEHCatchRet(BasicBlock* block)
364{
365 // Set RAX to the address the VM should return to after the catch.
366 // Generate a RIP-relative
367 // lea reg, [rip + disp32] ; the RIP is implicit
368 // which will be position-independent.
369 getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET);
370}
371
372#else // !FEATURE_EH_FUNCLETS
373
374void CodeGen::genEHFinallyOrFilterRet(BasicBlock* block)
375{
376 // The last statement of the block must be a GT_RETFILT, which has already been generated.
377 assert(block->lastNode() != nullptr);
378 assert(block->lastNode()->OperGet() == GT_RETFILT);
379
380 if (block->bbJumpKind == BBJ_EHFINALLYRET)
381 {
382 assert(block->lastNode()->gtOp.gtOp1 == nullptr); // op1 == nullptr means endfinally
383
        // Return using a pop-jmp sequence. As the "try" block calls the finally with a jmp,
        // this leaves the x86 call-ret stack balanced on the normal flow path.
387
388 noway_assert(isFramePointerRequired());
389 inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
390 inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
391 }
392 else
393 {
394 assert(block->bbJumpKind == BBJ_EHFILTERRET);
395
396 // The return value has already been computed.
397 instGen_Return(0);
398 }
399}
400
401#endif // !FEATURE_EH_FUNCLETS
402
403// Move an immediate value into an integer register
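// Roughly (a sketch; the relocatable case applies only when the immediate can be encoded
// as a PC-relative offset):
//   xor reg, reg            ; imm == 0 and no reloc
//   lea reg, [rip + disp32] ; PC-relative relocatable immediate
//   mov reg, imm            ; otherwise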
404
405void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags)
406{
407 // reg cannot be a FP register
408 assert(!genIsValidFloatReg(reg));
409
410 if (!compiler->opts.compReloc)
411 {
412 size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
413 }
414
415 if ((imm == 0) && !EA_IS_RELOC(size))
416 {
417 instGen_Set_Reg_To_Zero(size, reg, flags);
418 }
419 else
420 {
421 if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm))
422 {
423 emitAttr newSize = EA_PTR_DSP_RELOC;
424 if (EA_IS_BYREF(size))
425 {
426 newSize = EA_SET_FLG(newSize, EA_BYREF_FLG);
427 }
428
429 getEmitter()->emitIns_R_AI(INS_lea, newSize, reg, imm);
430 }
431 else
432 {
433 getEmitter()->emitIns_R_I(INS_mov, size, reg, imm);
434 }
435 }
436 regSet.verifyRegUsed(reg);
437}
438
439/***********************************************************************************
440 *
441 * Generate code to set a register 'targetReg' of type 'targetType' to the constant
442 * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call
443 * genProduceReg() on the target register.
444 */
445void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree)
446{
447 switch (tree->gtOper)
448 {
449 case GT_CNS_INT:
450 {
451 // relocatable values tend to come down as a CNS_INT of native int type
452 // so the line between these two opcodes is kind of blurry
453 GenTreeIntConCommon* con = tree->AsIntConCommon();
454 ssize_t cnsVal = con->IconValue();
455
456 if (con->ImmedValNeedsReloc(compiler))
457 {
458 emitAttr size = EA_HANDLE_CNS_RELOC;
459
460 if (targetType == TYP_BYREF)
461 {
462 size = EA_SET_FLG(size, EA_BYREF_FLG);
463 }
464
465 instGen_Set_Reg_To_Imm(size, targetReg, cnsVal);
466 regSet.verifyRegUsed(targetReg);
467 }
468 else
469 {
470 genSetRegToIcon(targetReg, cnsVal, targetType);
471 }
472 }
473 break;
474
475 case GT_CNS_DBL:
476 {
477 emitter* emit = getEmitter();
478 emitAttr size = emitTypeSize(targetType);
479 double constValue = tree->gtDblCon.gtDconVal;
480
            // Make sure we use "xorps reg, reg" only for the positive zero constant (0.0)
            // and not for negative zero (-0.0).
482 if (*(__int64*)&constValue == 0)
483 {
484 // A faster/smaller way to generate 0
485 emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg);
486 }
487 else
488 {
489 CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size);
490 emit->emitIns_R_C(ins_Load(targetType), size, targetReg, hnd, 0);
491 }
492 }
493 break;
494
495 default:
496 unreached();
497 }
498}
499
500//------------------------------------------------------------------------
501// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node.
502//
503// Arguments:
504// tree - the node
505//
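// For the integer case this is roughly (a sketch):
//   mov     targetReg, operandReg   ; only if the registers differ
//   neg/not targetReg
// The floating-point case (GT_NEG only) is handled by genSSE2BitwiseOp.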
506void CodeGen::genCodeForNegNot(GenTree* tree)
507{
508 assert(tree->OperIs(GT_NEG, GT_NOT));
509
510 regNumber targetReg = tree->gtRegNum;
511 var_types targetType = tree->TypeGet();
512
513 if (varTypeIsFloating(targetType))
514 {
515 assert(tree->gtOper == GT_NEG);
516 genSSE2BitwiseOp(tree);
517 }
518 else
519 {
520 GenTree* operand = tree->gtGetOp1();
521 assert(operand->isUsedFromReg());
522 regNumber operandReg = genConsumeReg(operand);
523
524 if (operandReg != targetReg)
525 {
526 inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
527 }
528
529 instruction ins = genGetInsForOper(tree->OperGet(), targetType);
530 inst_RV(ins, targetReg, targetType);
531 }
532
533 genProduceReg(tree);
534}
535
536//------------------------------------------------------------------------
537// genCodeForBswap: Produce code for a GT_BSWAP / GT_BSWAP16 node.
538//
539// Arguments:
540// tree - the node
541//
542void CodeGen::genCodeForBswap(GenTree* tree)
543{
544 // TODO: If we're swapping immediately after a read from memory or immediately before
545 // a write to memory, use the MOVBE instruction instead of the BSWAP instruction if
546 // the platform supports it.
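    // Roughly (a sketch):
    //   mov   targetReg, operandReg   ; only if the registers differ
    //   bswap targetReg               ; GT_BSWAP (32/64-bit)
    //   ror   targetReg.16, 8         ; GT_BSWAP16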
547
548 assert(tree->OperIs(GT_BSWAP, GT_BSWAP16));
549
550 regNumber targetReg = tree->gtRegNum;
551 var_types targetType = tree->TypeGet();
552
553 GenTree* operand = tree->gtGetOp1();
554 assert(operand->isUsedFromReg());
555 regNumber operandReg = genConsumeReg(operand);
556
557 if (operandReg != targetReg)
558 {
559 inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
560 }
561
562 if (tree->OperIs(GT_BSWAP))
563 {
564 // 32-bit and 64-bit byte swaps use "bswap reg"
565 inst_RV(INS_bswap, targetReg, targetType);
566 }
567 else
568 {
569 // 16-bit byte swaps use "ror reg.16, 8"
570 inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
571 }
572
573 genProduceReg(tree);
574}
575
// Generate code to get the high N bits of an N*N=2N bit multiplication result
577void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
578{
579 assert(!treeNode->gtOverflowEx());
580
581 regNumber targetReg = treeNode->gtRegNum;
582 var_types targetType = treeNode->TypeGet();
583 emitter* emit = getEmitter();
584 emitAttr size = emitTypeSize(treeNode);
585 GenTree* op1 = treeNode->gtOp.gtOp1;
586 GenTree* op2 = treeNode->gtOp.gtOp2;
587
    // To get the high bits of the multiply, we are constrained to using the
    // 1-op form: RDX:RAX = RAX * rm.
    // The 3-op form (Rx = Ry * Rz) does not produce the high bits.
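    // So the sequence is roughly (a sketch):
    //   mov  rax, regOp          ; only if regOp is not already RAX
    //   imul/mul rmOp            ; RDX:RAX = RAX * rmOp
    //   mov  targetReg, rdx      ; GT_MULHI, only if targetReg != RDX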
591
592 genConsumeOperands(treeNode->AsOp());
593
594 GenTree* regOp = op1;
595 GenTree* rmOp = op2;
596
597 // Set rmOp to the memory operand (if any)
598 if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == REG_RAX)))
599 {
600 regOp = op2;
601 rmOp = op1;
602 }
603 assert(regOp->isUsedFromReg());
604
    // Move the register operand into RAX when it is not already there.
606 if (regOp->gtRegNum != REG_RAX)
607 {
608 inst_RV_RV(ins_Copy(targetType), REG_RAX, regOp->gtRegNum, targetType);
609 }
610
611 instruction ins;
612 if ((treeNode->gtFlags & GTF_UNSIGNED) == 0)
613 {
614 ins = INS_imulEAX;
615 }
616 else
617 {
618 ins = INS_mulEAX;
619 }
620 emit->emitInsBinary(ins, size, treeNode, rmOp);
621
622 // Move the result to the desired register, if necessary
623 if (treeNode->OperGet() == GT_MULHI && targetReg != REG_RDX)
624 {
625 inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
626 }
627
628 genProduceReg(treeNode);
629}
630
631#ifdef _TARGET_X86_
632//------------------------------------------------------------------------
633// genCodeForLongUMod: Generate code for a tree of the form
634// `(umod (gt_long x y) (const int))`
635//
636// Arguments:
637// node - the node for which to generate code
638//
639void CodeGen::genCodeForLongUMod(GenTreeOp* node)
640{
641 assert(node != nullptr);
642 assert(node->OperGet() == GT_UMOD);
643 assert(node->TypeGet() == TYP_INT);
644
645 GenTreeOp* const dividend = node->gtOp1->AsOp();
646 assert(dividend->OperGet() == GT_LONG);
647 assert(varTypeIsLong(dividend));
648
649 genConsumeOperands(node);
650
651 GenTree* const dividendLo = dividend->gtOp1;
652 GenTree* const dividendHi = dividend->gtOp2;
653 assert(dividendLo->isUsedFromReg());
654 assert(dividendHi->isUsedFromReg());
655
656 GenTree* const divisor = node->gtOp2;
657 assert(divisor->gtSkipReloadOrCopy()->OperGet() == GT_CNS_INT);
658 assert(divisor->gtSkipReloadOrCopy()->isUsedFromReg());
659 assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal >= 2);
660 assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal <= 0x3fffffff);
661
    // dividendLo must be in EAX; dividendHi must be in EDX
663 genCopyRegIfNeeded(dividendLo, REG_EAX);
664 genCopyRegIfNeeded(dividendHi, REG_EDX);
665
    // At this point, EDX:EAX contains the 64-bit dividend and op2->gtRegNum
    // contains the 32-bit divisor. We want to generate the following code:
668 //
669 // cmp edx, divisor->gtRegNum
670 // jb noOverflow
671 //
672 // mov temp, eax
673 // mov eax, edx
674 // xor edx, edx
675 // div divisor->gtRegNum
676 // mov eax, temp
677 //
678 // noOverflow:
679 // div divisor->gtRegNum
680 //
681 // This works because (a * 2^32 + b) % c = ((a % c) * 2^32 + b) % c.
682
683 BasicBlock* const noOverflow = genCreateTempLabel();
684
685 // cmp edx, divisor->gtRegNum
686 // jb noOverflow
687 inst_RV_RV(INS_cmp, REG_EDX, divisor->gtRegNum);
688 inst_JMP(EJ_jb, noOverflow);
689
690 // mov temp, eax
691 // mov eax, edx
692 // xor edx, edx
693 // div divisor->gtRegNum
694 // mov eax, temp
695 const regNumber tempReg = node->GetSingleTempReg();
696 inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT);
697 inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT);
698 instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
699 inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
700 inst_RV_RV(INS_mov, REG_EAX, tempReg, TYP_INT);
701
702 // noOverflow:
703 // div divisor->gtRegNum
704 genDefineTempLabel(noOverflow);
705 inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
706
707 const regNumber targetReg = node->gtRegNum;
708 if (targetReg != REG_EDX)
709 {
710 inst_RV_RV(INS_mov, targetReg, REG_RDX, TYP_INT);
711 }
712 genProduceReg(node);
713}
714#endif // _TARGET_X86_
715
716//------------------------------------------------------------------------
717// genCodeForDivMod: Generate code for a DIV or MOD operation.
718//
719// Arguments:
720// treeNode - the node to generate the code for
721//
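// The generated sequence is roughly (a sketch; 32-bit or 64-bit forms depending on the type):
//   mov  rax, dividendReg       ; only if needed
//   cdq / xor edx, edx          ; sign- or zero-extend the dividend into the high half
//   idiv/div divisorOp
//   mov  targetReg, rax/rdx     ; quotient for DIV/UDIV, remainder for MOD/UMOD, if needed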
722void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
723{
724 assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD));
725
726 GenTree* dividend = treeNode->gtOp1;
727
728#ifdef _TARGET_X86_
729 if (varTypeIsLong(dividend->TypeGet()))
730 {
731 genCodeForLongUMod(treeNode);
732 return;
733 }
734#endif // _TARGET_X86_
735
736 GenTree* divisor = treeNode->gtOp2;
737 genTreeOps oper = treeNode->OperGet();
738 emitAttr size = emitTypeSize(treeNode);
739 regNumber targetReg = treeNode->gtRegNum;
740 var_types targetType = treeNode->TypeGet();
741 emitter* emit = getEmitter();
742
    // The node's type must be int/native int; small integer types are not
    // supported, and floating point types are handled by genCodeForBinary.
745 assert(varTypeIsIntOrI(targetType));
746 // dividend is in a register.
747 assert(dividend->isUsedFromReg());
748
749 genConsumeOperands(treeNode->AsOp());
750 // dividend must be in RAX
751 genCopyRegIfNeeded(dividend, REG_RAX);
752
753 // zero or sign extend rax to rdx
754 if (oper == GT_UMOD || oper == GT_UDIV)
755 {
756 instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
757 }
758 else
759 {
760 emit->emitIns(INS_cdq, size);
        // The cdq instruction writes RDX, so clear the gcInfo for RDX.
762 gcInfo.gcMarkRegSetNpt(RBM_RDX);
763 }
764
765 // Perform the 'targetType' (64-bit or 32-bit) divide instruction
766 instruction ins;
767 if (oper == GT_UMOD || oper == GT_UDIV)
768 {
769 ins = INS_div;
770 }
771 else
772 {
773 ins = INS_idiv;
774 }
775
776 emit->emitInsBinary(ins, size, treeNode, divisor);
777
778 // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX.
779 // Move the result to the desired register, if necessary
780 if (oper == GT_DIV || oper == GT_UDIV)
781 {
782 if (targetReg != REG_RAX)
783 {
784 inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
785 }
786 }
787 else
788 {
789 assert((oper == GT_MOD) || (oper == GT_UMOD));
790 if (targetReg != REG_RDX)
791 {
792 inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
793 }
794 }
795 genProduceReg(treeNode);
796}
797
798//------------------------------------------------------------------------
799// genCodeForBinary: Generate code for many binary arithmetic operators
800//
801// Arguments:
802// treeNode - The binary operation for which we are generating code.
803//
804// Return Value:
805// None.
806//
807// Notes:
808// Integer MUL and DIV variants have special constraints on x64 so are not handled here.
809// See the assert below for the operators that are handled.
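// For example (a sketch), an integer 'reg3 = reg1 + reg2' with three distinct registers
// typically becomes either
//   mov reg3, reg1
//   add reg3, reg2
// or, when the flags result is not needed, a single
//   lea reg3, [reg1 + reg2]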
810
811void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
812{
813#ifdef DEBUG
814 bool isValidOper = treeNode->OperIs(GT_ADD, GT_SUB);
815 if (varTypeIsFloating(treeNode->TypeGet()))
816 {
817 isValidOper |= treeNode->OperIs(GT_MUL, GT_DIV);
818 }
819 else
820 {
821 isValidOper |= treeNode->OperIs(GT_AND, GT_OR, GT_XOR);
822#ifndef _TARGET_64BIT_
823 isValidOper |= treeNode->OperIs(GT_ADD_LO, GT_ADD_HI, GT_SUB_LO, GT_SUB_HI);
824#endif
825 }
826 assert(isValidOper);
827#endif
828
829 genConsumeOperands(treeNode);
830
831 const genTreeOps oper = treeNode->OperGet();
832 regNumber targetReg = treeNode->gtRegNum;
833 var_types targetType = treeNode->TypeGet();
834 emitter* emit = getEmitter();
835
836 GenTree* op1 = treeNode->gtGetOp1();
837 GenTree* op2 = treeNode->gtGetOp2();
838
839 // Commutative operations can mark op1 as contained or reg-optional to generate "op reg, memop/immed"
840 if (!op1->isUsedFromReg())
841 {
842 assert(treeNode->OperIsCommutative());
843 assert(op1->isMemoryOp() || op1->IsLocal() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() ||
844 op1->IsRegOptional());
845
846 op1 = treeNode->gtGetOp2();
847 op2 = treeNode->gtGetOp1();
848 }
849
850 instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
851
852 // The arithmetic node must be sitting in a register (since it's not contained)
853 noway_assert(targetReg != REG_NA);
854
855 regNumber op1reg = op1->isUsedFromReg() ? op1->gtRegNum : REG_NA;
856 regNumber op2reg = op2->isUsedFromReg() ? op2->gtRegNum : REG_NA;
857
858 GenTree* dst;
859 GenTree* src;
860
861 // This is the case of reg1 = reg1 op reg2
862 // We're ready to emit the instruction without any moves
863 if (op1reg == targetReg)
864 {
865 dst = op1;
866 src = op2;
867 }
    // We have reg1 = reg2 op reg1.
    // For this operation to be correct, op must be commutative, so that
    // we can convert it into reg1 = reg1 op reg2 and emit the same code
    // as above.
873 else if (op2reg == targetReg)
874 {
875 noway_assert(GenTree::OperIsCommutative(oper));
876 dst = op2;
877 src = op1;
878 }
879 // now we know there are 3 different operands so attempt to use LEA
880 else if (oper == GT_ADD && !varTypeIsFloating(treeNode) && !treeNode->gtOverflowEx() // LEA does not set flags
881 && (op2->isContainedIntOrIImmed() || op2->isUsedFromReg()) && !treeNode->gtSetFlags())
882 {
883 if (op2->isContainedIntOrIImmed())
884 {
885 emit->emitIns_R_AR(INS_lea, emitTypeSize(treeNode), targetReg, op1reg,
886 (int)op2->AsIntConCommon()->IconValue());
887 }
888 else
889 {
890 assert(op2reg != REG_NA);
891 emit->emitIns_R_ARX(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, op2reg, 1, 0);
892 }
893 genProduceReg(treeNode);
894 return;
895 }
896 // dest, op1 and op2 registers are different:
897 // reg3 = reg1 op reg2
898 // We can implement this by issuing a mov:
899 // reg3 = reg1
900 // reg3 = reg3 op reg2
901 else
902 {
903 inst_RV_RV(ins_Copy(targetType), targetReg, op1reg, targetType);
904 regSet.verifyRegUsed(targetReg);
905 gcInfo.gcMarkRegPtrVal(targetReg, targetType);
906 dst = treeNode;
907 src = op2;
908 }
909
910 // try to use an inc or dec
911 if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
912 {
913 if (src->IsIntegralConst(1))
914 {
915 emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
916 genProduceReg(treeNode);
917 return;
918 }
919 else if (src->IsIntegralConst(-1))
920 {
921 emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
922 genProduceReg(treeNode);
923 return;
924 }
925 }
926 regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
927 noway_assert(r == targetReg);
928
929 if (treeNode->gtOverflowEx())
930 {
931#if !defined(_TARGET_64BIT_)
932 assert(oper == GT_ADD || oper == GT_SUB || oper == GT_ADD_HI || oper == GT_SUB_HI);
933#else
934 assert(oper == GT_ADD || oper == GT_SUB);
935#endif
936 genCheckOverflow(treeNode);
937 }
938 genProduceReg(treeNode);
939}
940
941//------------------------------------------------------------------------
942// genCodeForMul: Generate code for a MUL operation.
943//
944// Arguments:
945// treeNode - the node to generate the code for
946//
947void CodeGen::genCodeForMul(GenTreeOp* treeNode)
948{
949 assert(treeNode->OperIs(GT_MUL));
950
951 regNumber targetReg = treeNode->gtRegNum;
952 var_types targetType = treeNode->TypeGet();
953 emitter* emit = getEmitter();
954
    // The node's type must be int or long (the latter only on x64); small integer types
    // are not supported, and floating point types are handled by genCodeForBinary.
957 assert(varTypeIsIntOrI(targetType));
958
959 instruction ins;
960 emitAttr size = emitTypeSize(treeNode);
961 bool isUnsignedMultiply = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
962 bool requiresOverflowCheck = treeNode->gtOverflowEx();
963
964 GenTree* op1 = treeNode->gtGetOp1();
965 GenTree* op2 = treeNode->gtGetOp2();
966
967 // there are 3 forms of x64 multiply:
968 // 1-op form with 128 result: RDX:RAX = RAX * rm
969 // 2-op form: reg *= rm
970 // 3-op form: reg = rm * imm
971
972 genConsumeOperands(treeNode);
973
974 // This matches the 'mul' lowering in Lowering::SetMulOpCounts()
975 //
976 // immOp :: Only one operand can be an immediate
977 // rmOp :: Only one operand can be a memory op.
978 // regOp :: A register op (especially the operand that matches 'targetReg')
979 // (can be nullptr when we have both a memory op and an immediate op)
980
981 GenTree* immOp = nullptr;
982 GenTree* rmOp = op1;
983 GenTree* regOp;
984
985 if (op2->isContainedIntOrIImmed())
986 {
987 immOp = op2;
988 }
989 else if (op1->isContainedIntOrIImmed())
990 {
991 immOp = op1;
992 rmOp = op2;
993 }
994
995 if (immOp != nullptr)
996 {
997 // CQ: When possible use LEA for mul by imm 3, 5 or 9
998 ssize_t imm = immOp->AsIntConCommon()->IconValue();
999
1000 if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
1001 {
1002 // We will use the LEA instruction to perform this multiply
1003 // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
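            // For example (a sketch), imm == 5 gives scale == 4:
            //   lea targetReg, [srcReg + 4*srcReg]   ; targetReg = srcReg * 5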
1004 unsigned int scale = (unsigned int)(imm - 1);
1005 getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
1006 }
1007 else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0))
1008 {
1009 // Use shift for constant multiply when legal
1010 uint64_t zextImm = static_cast<uint64_t>(static_cast<size_t>(imm));
1011 unsigned int shiftAmount = genLog2(zextImm);
1012
1013 if (targetReg != rmOp->gtRegNum)
1014 {
1015 // Copy reg src to dest register
1016 inst_RV_RV(INS_mov, targetReg, rmOp->gtRegNum, targetType);
1017 }
1018 inst_RV_SH(INS_shl, size, targetReg, shiftAmount);
1019 }
1020 else
1021 {
1022 // use the 3-op form with immediate
1023 ins = getEmitter()->inst3opImulForReg(targetReg);
1024 emit->emitInsBinary(ins, size, rmOp, immOp);
1025 }
1026 }
1027 else // we have no contained immediate operand
1028 {
1029 regOp = op1;
1030 rmOp = op2;
1031
1032 regNumber mulTargetReg = targetReg;
1033 if (isUnsignedMultiply && requiresOverflowCheck)
1034 {
1035 ins = INS_mulEAX;
1036 mulTargetReg = REG_RAX;
1037 }
1038 else
1039 {
1040 ins = INS_imul;
1041 }
1042
1043 // Set rmOp to the memory operand (if any)
1044 // or set regOp to the op2 when it has the matching target register for our multiply op
1045 //
1046 if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == mulTargetReg)))
1047 {
1048 regOp = op2;
1049 rmOp = op1;
1050 }
1051 assert(regOp->isUsedFromReg());
1052
        // Move the register operand into the multiply target register when it is not already there.
1054 if (regOp->gtRegNum != mulTargetReg)
1055 {
1056 inst_RV_RV(INS_mov, mulTargetReg, regOp->gtRegNum, targetType);
1057 }
1058
1059 emit->emitInsBinary(ins, size, treeNode, rmOp);
1060
1061 // Move the result to the desired register, if necessary
1062 if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
1063 {
1064 inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
1065 }
1066 }
1067
1068 if (requiresOverflowCheck)
1069 {
1070 // Overflow checking is only used for non-floating point types
1071 noway_assert(!varTypeIsFloating(treeNode));
1072
1073 genCheckOverflow(treeNode);
1074 }
1075
1076 genProduceReg(treeNode);
1077}
1078
1079//------------------------------------------------------------------------
1080// isStructReturn: Returns whether the 'treeNode' is returning a struct.
1081//
1082// Arguments:
1083// treeNode - The tree node to evaluate whether is a struct return.
1084//
1085// Return Value:
// For AMD64 *nix: returns true if 'treeNode' is a GT_RETURN node of struct type.
1087// Otherwise returns false.
1088// For other platforms always returns false.
1089//
1090bool CodeGen::isStructReturn(GenTree* treeNode)
1091{
    // This method can be called for a 'treeNode' of GT_RETFILT or GT_RETURN.
    // For GT_RETFILT, the return is always a bool or a void (for the end of a
    // finally block), so it is never a struct return.
1095 noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
1096 if (treeNode->OperGet() != GT_RETURN)
1097 {
1098 return false;
1099 }
1100
1101#ifdef UNIX_AMD64_ABI
1102 return varTypeIsStruct(treeNode);
1103#else // !UNIX_AMD64_ABI
1104 assert(!varTypeIsStruct(treeNode));
1105 return false;
1106#endif // UNIX_AMD64_ABI
1107}
1108
1109//------------------------------------------------------------------------
1110// genStructReturn: Generates code for returning a struct.
1111//
1112// Arguments:
1113// treeNode - The GT_RETURN tree node.
1114//
1115// Return Value:
1116// None
1117//
1118// Assumption:
1119// op1 of GT_RETURN node is either GT_LCL_VAR or multi-reg GT_CALL
1120void CodeGen::genStructReturn(GenTree* treeNode)
1121{
1122 assert(treeNode->OperGet() == GT_RETURN);
1123 GenTree* op1 = treeNode->gtGetOp1();
1124
1125#ifdef UNIX_AMD64_ABI
1126 if (op1->OperGet() == GT_LCL_VAR)
1127 {
1128 GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon();
1129 LclVarDsc* varDsc = &(compiler->lvaTable[lclVar->gtLclNum]);
1130 assert(varDsc->lvIsMultiRegRet);
1131
1132 ReturnTypeDesc retTypeDesc;
1133 retTypeDesc.InitializeStructReturnType(compiler, varDsc->lvVerTypeInfo.GetClassHandle());
1134 unsigned regCount = retTypeDesc.GetReturnRegCount();
1135 assert(regCount == MAX_RET_REG_COUNT);
1136
1137 if (varTypeIsEnregisterableStruct(op1))
1138 {
1139 // Right now the only enregistrable structs supported are SIMD vector types.
1140 assert(varTypeIsSIMD(op1));
1141 assert(op1->isUsedFromReg());
1142
            // This is the case where the operand is in a single reg and needs to be
            // returned in multiple ABI return registers.
1145 regNumber opReg = genConsumeReg(op1);
1146 regNumber reg0 = retTypeDesc.GetABIReturnReg(0);
1147 regNumber reg1 = retTypeDesc.GetABIReturnReg(1);
1148
1149 if (opReg != reg0 && opReg != reg1)
1150 {
                // Operand reg is different from the return regs.
                // Copy opReg to reg0 and let it be handled by one of the
                // two cases below.
1154 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
1155 opReg = reg0;
1156 }
1157
1158 if (opReg == reg0)
1159 {
1160 assert(opReg != reg1);
1161
1162 // reg0 - already has required 8-byte in bit position [63:0].
1163 // reg1 = opReg.
1164 // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
1165 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg1, opReg, TYP_DOUBLE);
1166 }
1167 else
1168 {
1169 assert(opReg == reg1);
1170
1171 // reg0 = opReg.
1172 // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
1173 inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
1174 }
1175 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01);
1176 }
1177 else
1178 {
1179 assert(op1->isUsedFromMemory());
1180
1181 // Copy var on stack into ABI return registers
1182 int offset = 0;
1183 for (unsigned i = 0; i < regCount; ++i)
1184 {
1185 var_types type = retTypeDesc.GetReturnRegType(i);
1186 regNumber reg = retTypeDesc.GetABIReturnReg(i);
1187 getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), reg, lclVar->gtLclNum, offset);
1188 offset += genTypeSize(type);
1189 }
1190 }
1191 }
1192 else
1193 {
1194 assert(op1->IsMultiRegCall() || op1->IsCopyOrReloadOfMultiRegCall());
1195
1196 genConsumeRegs(op1);
1197
1198 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
1199 GenTreeCall* call = actualOp1->AsCall();
1200 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
1201 unsigned regCount = retTypeDesc->GetReturnRegCount();
1202 assert(regCount == MAX_RET_REG_COUNT);
1203
1204 // Handle circular dependency between call allocated regs and ABI return regs.
1205 //
        // It is possible under LSRA stress that the originally allocated regs of the call node,
        // say rax and rdx, are spilled and reloaded to rdx and rax respectively. But
        // GT_RETURN needs to move values as follows: rdx->rax, rax->rdx. A similar kind
        // of circular dependency could arise between the xmm0 and xmm1 return regs.
        // Codegen is expected to handle such circular dependencies.
1211 //
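        // For example (a sketch): if the values were reloaded such that allocatedReg0 == rdx and
        // allocatedReg1 == rax while returnReg0 == rax and returnReg1 == rdx, a single
        //   xchg rax, rdx
        // resolves the cycle (the xmm case below uses three pxor instructions instead).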
1212 var_types regType0 = retTypeDesc->GetReturnRegType(0);
1213 regNumber returnReg0 = retTypeDesc->GetABIReturnReg(0);
1214 regNumber allocatedReg0 = call->GetRegNumByIdx(0);
1215
1216 var_types regType1 = retTypeDesc->GetReturnRegType(1);
1217 regNumber returnReg1 = retTypeDesc->GetABIReturnReg(1);
1218 regNumber allocatedReg1 = call->GetRegNumByIdx(1);
1219
1220 if (op1->IsCopyOrReload())
1221 {
1222 // GT_COPY/GT_RELOAD will have valid reg for those positions
1223 // that need to be copied or reloaded.
1224 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
1225 if (reloadReg != REG_NA)
1226 {
1227 allocatedReg0 = reloadReg;
1228 }
1229
1230 reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
1231 if (reloadReg != REG_NA)
1232 {
1233 allocatedReg1 = reloadReg;
1234 }
1235 }
1236
1237 if (allocatedReg0 == returnReg1 && allocatedReg1 == returnReg0)
1238 {
1239 // Circular dependency - swap allocatedReg0 and allocatedReg1
1240 if (varTypeIsFloating(regType0))
1241 {
1242 assert(varTypeIsFloating(regType1));
1243
1244 // The fastest way to swap two XMM regs is using PXOR
1245 inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
1246 inst_RV_RV(INS_pxor, allocatedReg1, allocatedReg0, TYP_DOUBLE);
1247 inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
1248 }
1249 else
1250 {
1251 assert(varTypeIsIntegral(regType0));
1252 assert(varTypeIsIntegral(regType1));
1253 inst_RV_RV(INS_xchg, allocatedReg1, allocatedReg0, TYP_I_IMPL);
1254 }
1255 }
1256 else if (allocatedReg1 == returnReg0)
1257 {
1258 // Change the order of moves to correctly handle dependency.
1259 if (allocatedReg1 != returnReg1)
1260 {
1261 inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
1262 }
1263
1264 if (allocatedReg0 != returnReg0)
1265 {
1266 inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
1267 }
1268 }
1269 else
1270 {
1271 // No circular dependency case.
1272 if (allocatedReg0 != returnReg0)
1273 {
1274 inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
1275 }
1276
1277 if (allocatedReg1 != returnReg1)
1278 {
1279 inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
1280 }
1281 }
1282 }
1283#else
1284 unreached();
1285#endif
1286}
1287
1288#if defined(_TARGET_X86_)
1289
1290//------------------------------------------------------------------------
1291// genFloatReturn: Generates code for float return statement for x86.
1292//
1293// Note: treeNode's and op1's registers are already consumed.
1294//
1295// Arguments:
1296// treeNode - The GT_RETURN or GT_RETFILT tree node with float type.
1297//
1298// Return Value:
1299// None
1300//
1301void CodeGen::genFloatReturn(GenTree* treeNode)
1302{
1303 assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
1304 assert(varTypeIsFloating(treeNode));
1305
1306 GenTree* op1 = treeNode->gtGetOp1();
1307 // Spill the return value register from an XMM register to the stack, then load it on the x87 stack.
1308 // If it already has a home location, use that. Otherwise, we need a temp.
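    // Roughly (a sketch, for a double return whose value lives in an XMM register):
    //   movsd qword ptr [frame slot], xmmN
    //   fld   qword ptr [frame slot]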
1309 if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvOnFrame)
1310 {
1311 if (compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegNum != REG_STK)
1312 {
1313 op1->gtFlags |= GTF_SPILL;
1314 inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1,
1315 op1->gtRegNum);
1316 }
1317 // Now, load it to the fp stack.
1318 getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
1319 }
1320 else
1321 {
1322 // Spill the value, which should be in a register, then load it to the fp stack.
1323 // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
1324 op1->gtFlags |= GTF_SPILL;
1325 regSet.rsSpillTree(op1->gtRegNum, op1);
1326 op1->gtFlags |= GTF_SPILLED;
1327 op1->gtFlags &= ~GTF_SPILL;
1328
1329 TempDsc* t = regSet.rsUnspillInPlace(op1, op1->gtRegNum);
1330 inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
1331 op1->gtFlags &= ~GTF_SPILLED;
1332 regSet.tmpRlsTemp(t);
1333 }
1334}
1335#endif // _TARGET_X86_
1336
1337//------------------------------------------------------------------------
1338// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node.
1339//
1340// Arguments:
1341// tree - the node
1342//
1343void CodeGen::genCodeForCompare(GenTreeOp* tree)
1344{
1345 assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP));
1346
1347 // TODO-XArch-CQ: Check if we can use the currently set flags.
1348 // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
1349 // (signed < or >= where targetReg != REG_NA)
1350
1351 GenTree* op1 = tree->gtOp1;
1352 var_types op1Type = op1->TypeGet();
1353
1354 if (varTypeIsFloating(op1Type))
1355 {
1356 genCompareFloat(tree);
1357 }
1358 else
1359 {
1360 genCompareInt(tree);
1361 }
1362}
1363
1364//------------------------------------------------------------------------
1365// genCodeForBT: Generates code for a GT_BT node.
1366//
1367// Arguments:
1368// tree - The node.
1369//
1370void CodeGen::genCodeForBT(GenTreeOp* bt)
1371{
1372 assert(bt->OperIs(GT_BT));
1373
1374 GenTree* op1 = bt->gtGetOp1();
1375 GenTree* op2 = bt->gtGetOp2();
1376 var_types type = genActualType(op1->TypeGet());
1377
1378 assert(op1->isUsedFromReg() && op2->isUsedFromReg());
1379 assert((genTypeSize(type) >= genTypeSize(TYP_INT)) && (genTypeSize(type) <= genTypeSize(TYP_I_IMPL)));
1380
1381 genConsumeOperands(bt);
    // Note that the emitter doesn't fully support INS_bt; it only supports the reg,reg
    // form and encodes the registers in reverse order. To get the correct order, we need
    // to reverse the operands when calling emitIns_R_R.
1385 getEmitter()->emitIns_R_R(INS_bt, emitTypeSize(type), op2->gtRegNum, op1->gtRegNum);
1386}
1387
1388//------------------------------------------------------------------------
1389// genCodeForJumpTrue: Generates code for jmpTrue statement.
1390//
1391// Arguments:
1392// tree - The GT_JTRUE tree node.
1393//
1394// Return Value:
1395// None
1396//
1397void CodeGen::genCodeForJumpTrue(GenTree* tree)
1398{
1399 GenTree* cmp = tree->gtOp.gtOp1;
1400
1401 assert(cmp->OperIsCompare());
1402 assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
1403
1404#if !defined(_TARGET_64BIT_)
1405 // Long-typed compares should have been handled by Lowering::LowerCompare.
1406 assert(!varTypeIsLong(cmp->gtGetOp1()));
1407#endif
1408
1409 // Get the "kind" and type of the comparison. Note that whether it is an unsigned cmp
1410 // is governed by a flag NOT by the inherent type of the node
1411 // TODO-XArch-CQ: Check if we can use the currently set flags.
1412 emitJumpKind jumpKind[2];
1413 bool branchToTrueLabel[2];
1414 genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
1415
1416 BasicBlock* skipLabel = nullptr;
1417 if (jumpKind[0] != EJ_NONE)
1418 {
1419 BasicBlock* jmpTarget;
1420 if (branchToTrueLabel[0])
1421 {
1422 jmpTarget = compiler->compCurBB->bbJumpDest;
1423 }
1424 else
1425 {
1426 // This case arises only for ordered GT_EQ right now
1427 assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
1428 skipLabel = genCreateTempLabel();
1429 jmpTarget = skipLabel;
1430 }
1431
1432 inst_JMP(jumpKind[0], jmpTarget);
1433 }
1434
1435 if (jumpKind[1] != EJ_NONE)
1436 {
1437 // the second conditional branch always has to be to the true label
1438 assert(branchToTrueLabel[1]);
1439 inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
1440 }
1441
1442 if (skipLabel != nullptr)
1443 {
1444 genDefineTempLabel(skipLabel);
1445 }
1446}
1447
1448//------------------------------------------------------------------------
1449// genCodeForJcc: Produce code for a GT_JCC node.
1450//
1451// Arguments:
1452// tree - the node
1453//
1454void CodeGen::genCodeForJcc(GenTreeCC* tree)
1455{
1456 assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
1457
1458 CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
1459 emitJumpKind jumpKind = genJumpKindForOper(tree->gtCondition, compareKind);
1460
1461 inst_JMP(jumpKind, compiler->compCurBB->bbJumpDest);
1462}
1463
1464//------------------------------------------------------------------------
1465// genCodeForSetcc: Generates a setcc instruction for a GT_SETCC node.
1466//
1467// Arguments:
1468// tree - the GT_SETCC node
1469//
1470// Assumptions:
1471// The condition represents an integer comparison. This code doesn't
// have the necessary logic to deal with floating point comparisons; in fact, it
// doesn't even know whether the comparison is integer or floating point, because
// SETCC nodes do not have any operands.
1475//
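// The emitted sequence is roughly (a sketch):
//   setcc dstRegByteForm
//   movzx dstReg, dstRegByteForm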
1476
1477void CodeGen::genCodeForSetcc(GenTreeCC* setcc)
1478{
1479 regNumber dstReg = setcc->gtRegNum;
1480 CompareKind compareKind = setcc->IsUnsigned() ? CK_UNSIGNED : CK_SIGNED;
1481 emitJumpKind jumpKind = genJumpKindForOper(setcc->gtCondition, compareKind);
1482
1483 assert(genIsValidIntReg(dstReg) && isByteReg(dstReg));
1484 // Make sure nobody is setting GTF_RELOP_NAN_UN on this node as it is ignored.
1485 assert((setcc->gtFlags & GTF_RELOP_NAN_UN) == 0);
1486
1487 inst_SET(jumpKind, dstReg);
1488 inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
1489 genProduceReg(setcc);
1490}
1491
1492//------------------------------------------------------------------------
1493// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node.
1494//
1495// Arguments:
1496// tree - the GT_RETURNTRAP node
1497//
1498void CodeGen::genCodeForReturnTrap(GenTreeOp* tree)
1499{
1500 assert(tree->OperGet() == GT_RETURNTRAP);
1501
1502 // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
1503 // based on the contents of 'data'
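    // Roughly (a sketch):
    //   cmp  dword ptr [data], 0     ; or a reg form, depending on where 'data' lives
    //   je   skip
    //   call CORINFO_HELP_STOP_FOR_GC
    // skip: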
1504
1505 GenTree* data = tree->gtOp1;
1506 genConsumeRegs(data);
1507 GenTreeIntCon cns = intForm(TYP_INT, 0);
1508 cns.SetContained();
1509 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
1510
1511 BasicBlock* skipLabel = genCreateTempLabel();
1512
1513 emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
1514 inst_JMP(jmpEqual, skipLabel);
1515
1516 // emit the call to the EE-helper that stops for GC (or other reasons)
1517 regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
1518 assert(genIsValidIntReg(tmpReg));
1519
1520 genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
1521 genDefineTempLabel(skipLabel);
1522}
1523
1524/*****************************************************************************
1525 *
1526 * Generate code for a single node in the tree.
1527 * Preconditions: All operands have been evaluated
1528 *
1529 */
1530void CodeGen::genCodeForTreeNode(GenTree* treeNode)
1531{
1532 regNumber targetReg;
1533#if !defined(_TARGET_64BIT_)
1534 if (treeNode->TypeGet() == TYP_LONG)
1535 {
1536 // All long enregistered nodes will have been decomposed into their
1537 // constituent lo and hi nodes.
1538 targetReg = REG_NA;
1539 }
1540 else
1541#endif // !defined(_TARGET_64BIT_)
1542 {
1543 targetReg = treeNode->gtRegNum;
1544 }
1545 var_types targetType = treeNode->TypeGet();
1546 emitter* emit = getEmitter();
1547
1548#ifdef DEBUG
1549 // Validate that all the operands for the current node are consumed in order.
1550 // This is important because LSRA ensures that any necessary copies will be
1551 // handled correctly.
1552 lastConsumedNode = nullptr;
1553 if (compiler->verbose)
1554 {
1555 unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio
1556 compiler->gtDispLIRNode(treeNode, "Generating: ");
1557 }
1558#endif // DEBUG
1559
1560 // Is this a node whose value is already in a register? LSRA denotes this by
1561 // setting the GTF_REUSE_REG_VAL flag.
1562 if (treeNode->IsReuseRegVal())
1563 {
1564 // For now, this is only used for constant nodes.
1565 assert((treeNode->OperIsConst()));
1566 JITDUMP(" TreeNode is marked ReuseReg\n");
1567 return;
1568 }
1569
1570 // contained nodes are part of their parents for codegen purposes
    // e.g. immediates, most LEAs
1572 if (treeNode->isContained())
1573 {
1574 return;
1575 }
1576
1577 switch (treeNode->gtOper)
1578 {
1579#ifndef JIT32_GCENCODER
1580 case GT_START_NONGC:
1581 getEmitter()->emitDisableGC();
1582 break;
1583#endif // !defined(JIT32_GCENCODER)
1584
1585 case GT_PROF_HOOK:
1586#ifdef PROFILING_SUPPORTED
1587 // We should be seeing this only if profiler hook is needed
1588 noway_assert(compiler->compIsProfilerHookNeeded());
1589
1590 // Right now this node is used only for tail calls. In future if
1591 // we intend to use it for Enter or Leave hooks, add a data member
1592 // to this node indicating the kind of profiler hook. For example,
1593 // helper number can be used.
1594 genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
1595#endif // PROFILING_SUPPORTED
1596 break;
1597
1598 case GT_LCLHEAP:
1599 genLclHeap(treeNode);
1600 break;
1601
1602 case GT_CNS_INT:
1603#ifdef _TARGET_X86_
1604 assert(!treeNode->IsIconHandle(GTF_ICON_TLS_HDL));
1605#endif // _TARGET_X86_
1606 __fallthrough;
1607
1608 case GT_CNS_DBL:
1609 genSetRegToConst(targetReg, targetType, treeNode);
1610 genProduceReg(treeNode);
1611 break;
1612
1613 case GT_NOT:
1614 case GT_NEG:
1615 genCodeForNegNot(treeNode);
1616 break;
1617
1618 case GT_BSWAP:
1619 case GT_BSWAP16:
1620 genCodeForBswap(treeNode);
1621 break;
1622
1623 case GT_DIV:
1624 if (varTypeIsFloating(treeNode->TypeGet()))
1625 {
1626 genCodeForBinary(treeNode->AsOp());
1627 break;
1628 }
1629 __fallthrough;
1630 case GT_MOD:
1631 case GT_UMOD:
1632 case GT_UDIV:
1633 genCodeForDivMod(treeNode->AsOp());
1634 break;
1635
1636 case GT_OR:
1637 case GT_XOR:
1638 case GT_AND:
1639 assert(varTypeIsIntegralOrI(treeNode));
1640
1641 __fallthrough;
1642
1643#if !defined(_TARGET_64BIT_)
1644 case GT_ADD_LO:
1645 case GT_ADD_HI:
1646 case GT_SUB_LO:
1647 case GT_SUB_HI:
1648#endif // !defined(_TARGET_64BIT_)
1649
1650 case GT_ADD:
1651 case GT_SUB:
1652 genCodeForBinary(treeNode->AsOp());
1653 break;
1654
1655 case GT_MUL:
1656 if (varTypeIsFloating(treeNode->TypeGet()))
1657 {
1658 genCodeForBinary(treeNode->AsOp());
1659 break;
1660 }
1661 genCodeForMul(treeNode->AsOp());
1662 break;
1663
1664 case GT_LSH:
1665 case GT_RSH:
1666 case GT_RSZ:
1667 case GT_ROL:
1668 case GT_ROR:
1669 genCodeForShift(treeNode);
1670 break;
1671
1672#if !defined(_TARGET_64BIT_)
1673
1674 case GT_LSH_HI:
1675 case GT_RSH_LO:
1676 genCodeForShiftLong(treeNode);
1677 break;
1678
1679#endif // !defined(_TARGET_64BIT_)
1680
1681 case GT_CAST:
1682 genCodeForCast(treeNode->AsOp());
1683 break;
1684
1685 case GT_BITCAST:
1686 {
1687 GenTree* const op1 = treeNode->AsOp()->gtOp1;
1688 genConsumeReg(op1);
1689
1690 const bool srcFltReg = varTypeIsFloating(op1) || varTypeIsSIMD(op1);
1691 const bool dstFltReg = varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode);
1692 if (srcFltReg != dstFltReg)
1693 {
1694 instruction ins;
1695 regNumber fltReg;
1696 regNumber intReg;
1697 if (dstFltReg)
1698 {
1699 ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
1700 fltReg = treeNode->gtRegNum;
1701 intReg = op1->gtRegNum;
1702 }
1703 else
1704 {
1705 ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
1706 intReg = treeNode->gtRegNum;
1707 fltReg = op1->gtRegNum;
1708 }
1709 inst_RV_RV(ins, fltReg, intReg, treeNode->TypeGet());
1710 }
1711 else if (treeNode->gtRegNum != op1->gtRegNum)
1712 {
1713 inst_RV_RV(ins_Copy(treeNode->TypeGet()), treeNode->gtRegNum, op1->gtRegNum, treeNode->TypeGet());
1714 }
1715
1716 genProduceReg(treeNode);
1717 break;
1718 }
1719
1720 case GT_LCL_FLD_ADDR:
1721 case GT_LCL_VAR_ADDR:
1722 genCodeForLclAddr(treeNode);
1723 break;
1724
1725 case GT_LCL_FLD:
1726 genCodeForLclFld(treeNode->AsLclFld());
1727 break;
1728
1729 case GT_LCL_VAR:
1730 genCodeForLclVar(treeNode->AsLclVar());
1731 break;
1732
1733 case GT_STORE_LCL_FLD:
1734 genCodeForStoreLclFld(treeNode->AsLclFld());
1735 break;
1736
1737 case GT_STORE_LCL_VAR:
1738 genCodeForStoreLclVar(treeNode->AsLclVar());
1739 break;
1740
1741 case GT_RETFILT:
1742 case GT_RETURN:
1743 genReturn(treeNode);
1744 break;
1745
1746 case GT_LEA:
1747 // If we are here, it is the case where there is an LEA that cannot be folded into a parent instruction.
1748 genLeaInstruction(treeNode->AsAddrMode());
1749 break;
1750
1751 case GT_INDEX_ADDR:
1752 genCodeForIndexAddr(treeNode->AsIndexAddr());
1753 break;
1754
1755 case GT_IND:
1756 genCodeForIndir(treeNode->AsIndir());
1757 break;
1758
1759 case GT_MULHI:
1760#ifdef _TARGET_X86_
1761 case GT_MUL_LONG:
1762#endif
1763 genCodeForMulHi(treeNode->AsOp());
1764 break;
1765
1766 case GT_INTRINSIC:
1767 genIntrinsic(treeNode);
1768 break;
1769
1770#ifdef FEATURE_SIMD
1771 case GT_SIMD:
1772 genSIMDIntrinsic(treeNode->AsSIMD());
1773 break;
1774#endif // FEATURE_SIMD
1775
1776#ifdef FEATURE_HW_INTRINSICS
1777 case GT_HWIntrinsic:
1778 genHWIntrinsic(treeNode->AsHWIntrinsic());
1779 break;
1780#endif // FEATURE_HW_INTRINSICS
1781
1782 case GT_CKFINITE:
1783 genCkfinite(treeNode);
1784 break;
1785
1786 case GT_EQ:
1787 case GT_NE:
1788 case GT_LT:
1789 case GT_LE:
1790 case GT_GE:
1791 case GT_GT:
1792 case GT_TEST_EQ:
1793 case GT_TEST_NE:
1794 case GT_CMP:
1795 genCodeForCompare(treeNode->AsOp());
1796 break;
1797
1798 case GT_JTRUE:
1799 genCodeForJumpTrue(treeNode);
1800 break;
1801
1802 case GT_JCC:
1803 genCodeForJcc(treeNode->AsCC());
1804 break;
1805
1806 case GT_SETCC:
1807 genCodeForSetcc(treeNode->AsCC());
1808 break;
1809
1810 case GT_BT:
1811 genCodeForBT(treeNode->AsOp());
1812 break;
1813
1814 case GT_RETURNTRAP:
1815 genCodeForReturnTrap(treeNode->AsOp());
1816 break;
1817
1818 case GT_STOREIND:
1819 genCodeForStoreInd(treeNode->AsStoreInd());
1820 break;
1821
1822 case GT_COPY:
1823 // This is handled at the time we call genConsumeReg() on the GT_COPY
1824 break;
1825
1826 case GT_LIST:
1827 case GT_FIELD_LIST:
1828 // Should always be marked contained.
1829 assert(!"LIST, FIELD_LIST nodes should always be marked contained.");
1830 break;
1831
1832 case GT_SWAP:
1833 genCodeForSwap(treeNode->AsOp());
1834 break;
1835
1836 case GT_PUTARG_STK:
1837 genPutArgStk(treeNode->AsPutArgStk());
1838 break;
1839
1840 case GT_PUTARG_REG:
1841 genPutArgReg(treeNode->AsOp());
1842 break;
1843
1844 case GT_CALL:
1845 genCallInstruction(treeNode->AsCall());
1846 break;
1847
1848 case GT_JMP:
1849 genJmpMethod(treeNode);
1850 break;
1851
1852 case GT_LOCKADD:
1853 genCodeForLockAdd(treeNode->AsOp());
1854 break;
1855
1856 case GT_XCHG:
1857 case GT_XADD:
1858 genLockedInstructions(treeNode->AsOp());
1859 break;
1860
1861 case GT_MEMORYBARRIER:
1862 instGen_MemoryBarrier();
1863 break;
1864
1865 case GT_CMPXCHG:
1866 genCodeForCmpXchg(treeNode->AsCmpXchg());
1867 break;
1868
1869 case GT_RELOAD:
1870 // do nothing - reload is just a marker.
1871 // The parent node will call genConsumeReg on this which will trigger the unspill of this node's child
1872 // into the register specified in this node.
1873 break;
1874
1875 case GT_NOP:
1876 break;
1877
1878 case GT_NO_OP:
1879 getEmitter()->emitIns_Nop(1);
1880 break;
1881
1882 case GT_ARR_BOUNDS_CHECK:
1883#ifdef FEATURE_SIMD
1884 case GT_SIMD_CHK:
1885#endif // FEATURE_SIMD
1886#ifdef FEATURE_HW_INTRINSICS
1887 case GT_HW_INTRINSIC_CHK:
1888#endif // FEATURE_HW_INTRINSICS
1889 genRangeCheck(treeNode);
1890 break;
1891
1892 case GT_PHYSREG:
1893 genCodeForPhysReg(treeNode->AsPhysReg());
1894 break;
1895
1896 case GT_NULLCHECK:
1897 genCodeForNullCheck(treeNode->AsOp());
1898 break;
1899
1900 case GT_CATCH_ARG:
1901
1902 noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp));
1903
            /* Catch arguments get passed in a register. genCodeForBBlist()
               would have marked the register as holding a GC object, but the
               value is not otherwise used. */
1906
1907 noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT);
1908 genConsumeReg(treeNode);
1909 break;
1910
1911#if !FEATURE_EH_FUNCLETS
1912 case GT_END_LFIN:
1913
1914 // Have to clear the ShadowSP of the nesting level which encloses the finally. Generates:
1915 // mov dword ptr [ebp-0xC], 0 // for some slot of the ShadowSP local var
1916
1917 unsigned finallyNesting;
1918 finallyNesting = treeNode->gtVal.gtVal1;
1919 noway_assert(treeNode->gtVal.gtVal1 < compiler->compHndBBtabCount);
1920 noway_assert(finallyNesting < compiler->compHndBBtabCount);
1921
1922 // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
1923 unsigned filterEndOffsetSlotOffs;
1924 PREFIX_ASSUME(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) >
1925 TARGET_POINTER_SIZE); // below doesn't underflow.
1926 filterEndOffsetSlotOffs =
1927 (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
1928
1929 unsigned curNestingSlotOffs;
1930 curNestingSlotOffs = filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE);
1931 instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, curNestingSlotOffs);
1932 break;
1933#endif // !FEATURE_EH_FUNCLETS
1934
1935 case GT_PINVOKE_PROLOG:
1936 noway_assert(((gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur) & ~fullIntArgRegMask()) == 0);
1937
1938 // the runtime side requires the codegen here to be consistent
1939 emit->emitDisableRandomNops();
1940 break;
1941
1942 case GT_LABEL:
1943 genPendingCallLabel = genCreateTempLabel();
1944 treeNode->gtLabel.gtLabBB = genPendingCallLabel;
1945 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, genPendingCallLabel, treeNode->gtRegNum);
1946 break;
1947
1948 case GT_STORE_OBJ:
1949 case GT_STORE_DYN_BLK:
1950 case GT_STORE_BLK:
1951 genCodeForStoreBlk(treeNode->AsBlk());
1952 break;
1953
1954 case GT_JMPTABLE:
1955 genJumpTable(treeNode);
1956 break;
1957
1958 case GT_SWITCH_TABLE:
1959 genTableBasedSwitch(treeNode);
1960 break;
1961
1962 case GT_ARR_INDEX:
1963 genCodeForArrIndex(treeNode->AsArrIndex());
1964 break;
1965
1966 case GT_ARR_OFFSET:
1967 genCodeForArrOffset(treeNode->AsArrOffs());
1968 break;
1969
1970 case GT_CLS_VAR_ADDR:
1971 emit->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
1972 genProduceReg(treeNode);
1973 break;
1974
1975#if !defined(_TARGET_64BIT_)
1976 case GT_LONG:
1977 assert(treeNode->isUsedFromReg());
1978 genConsumeRegs(treeNode);
1979 break;
1980#endif
1981
1982 case GT_IL_OFFSET:
1983 // Do nothing; these nodes are simply markers for debug info.
1984 break;
1985
1986 default:
1987 {
1988#ifdef DEBUG
1989 char message[256];
1990 _snprintf_s(message, _countof(message), _TRUNCATE, "NYI: Unimplemented node type %s\n",
1991 GenTree::OpName(treeNode->OperGet()));
1992 NYIRAW(message);
1993#endif
1994 assert(!"Unknown node in codegen");
1995 }
1996 break;
1997 }
1998}
1999
2000//----------------------------------------------------------------------------------
2001// genMultiRegCallStoreToLocal: store multi-reg return value of a call node to a local
2002//
2003// Arguments:
2004// treeNode - Gentree of GT_STORE_LCL_VAR
2005//
2006// Return Value:
2007// None
2008//
2009// Assumption:
2010// The child of store is a multi-reg call node.
2011// genProduceReg() on treeNode is made by caller of this routine.
2012//
2013void CodeGen::genMultiRegCallStoreToLocal(GenTree* treeNode)
2014{
2015 assert(treeNode->OperGet() == GT_STORE_LCL_VAR);
2016
2017#ifdef UNIX_AMD64_ABI
2018 // Structs of size >=9 and <=16 are returned in two return registers on x64 Unix.
2019 assert(varTypeIsStruct(treeNode));
2020
2021 // Assumption: current x64 Unix implementation requires that a multi-reg struct
2022 // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
2023 // being struct promoted.
2024 unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
2025 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
2026 noway_assert(varDsc->lvIsMultiRegRet);
2027
2028 GenTree* op1 = treeNode->gtGetOp1();
2029 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
2030 GenTreeCall* call = actualOp1->AsCall();
2031 assert(call->HasMultiRegRetVal());
2032
2033 genConsumeRegs(op1);
2034
2035 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
2036 assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT);
2037 unsigned regCount = retTypeDesc->GetReturnRegCount();
2038
2039 if (treeNode->gtRegNum != REG_NA)
2040 {
2041 // Right now the only enregistrable structs supported are SIMD types.
2042 assert(varTypeIsSIMD(treeNode));
2043 assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
2044 assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));
2045
        // This is the case where the two 8-byte halves that comprise the operand
        // are in two different xmm registers and need to be assembled into a
        // single xmm register.
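        //
        // For reference, shufpd xmmDst, xmmSrc, imm8 (SSE2) selects halves as follows:
        //     xmmDst[63:0]   = imm8[0] ? xmmDst[127:64] : xmmDst[63:0]
        //     xmmDst[127:64] = imm8[1] ? xmmSrc[127:64] : xmmSrc[63:0]
        // An immediate of 0x00 therefore merges the low half of the source into the high
        // half of the destination, and 0x01 (with source == destination) swaps the halves.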
2049 regNumber targetReg = treeNode->gtRegNum;
2050 regNumber reg0 = call->GetRegNumByIdx(0);
2051 regNumber reg1 = call->GetRegNumByIdx(1);
2052
2053 if (op1->IsCopyOrReload())
2054 {
2055 // GT_COPY/GT_RELOAD will have valid reg for those positions
2056 // that need to be copied or reloaded.
2057 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
2058 if (reloadReg != REG_NA)
2059 {
2060 reg0 = reloadReg;
2061 }
2062
2063 reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
2064 if (reloadReg != REG_NA)
2065 {
2066 reg1 = reloadReg;
2067 }
2068 }
2069
2070 if (targetReg != reg0 && targetReg != reg1)
2071 {
            // Copy reg0 into targetReg and let the copy be handled by one
            // of the cases below. After the copy, targetReg holds the reg0
            // half, so treat it as reg0 from here on.
            inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE);
            reg0 = targetReg;
2076 }
2077
2078 if (targetReg == reg0)
2079 {
            // targetReg[63:0]   = targetReg[63:0]
            // targetReg[127:64] = reg1[63:0]
            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00);
2083 }
2084 else
2085 {
2086 assert(targetReg == reg1);
2087
            // We need two shuffles to achieve this:
            // First:
            // targetReg[63:0]   = targetReg[63:0]
            // targetReg[127:64] = reg0[63:0]
            //
            // Second:
            // targetReg[63:0]   = targetReg[127:64]
            // targetReg[127:64] = targetReg[63:0]
2096 //
2097 // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg
2098 // and next swap low and high 8-bytes of targetReg to have them
2099 // rearranged in the right order.
2100 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00);
2101 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01);
2102 }
2103 }
2104 else
2105 {
2106 // Stack store
2107 int offset = 0;
2108 for (unsigned i = 0; i < regCount; ++i)
2109 {
2110 var_types type = retTypeDesc->GetReturnRegType(i);
2111 regNumber reg = call->GetRegNumByIdx(i);
2112 if (op1->IsCopyOrReload())
2113 {
2114 // GT_COPY/GT_RELOAD will have valid reg for those positions
2115 // that need to be copied or reloaded.
2116 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
2117 if (reloadReg != REG_NA)
2118 {
2119 reg = reloadReg;
2120 }
2121 }
2122
2123 assert(reg != REG_NA);
2124 getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
2125 offset += genTypeSize(type);
2126 }
2127
2128 varDsc->lvRegNum = REG_STK;
2129 }
2130#elif defined(_TARGET_X86_)
2131 // Longs are returned in two return registers on x86.
2132 assert(varTypeIsLong(treeNode));
2133
2134 // Assumption: current x86 implementation requires that a multi-reg long
2135 // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
2136 // being promoted.
2137 unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
2138 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
2139 noway_assert(varDsc->lvIsMultiRegRet);
2140
2141 GenTree* op1 = treeNode->gtGetOp1();
2142 GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
2143 GenTreeCall* call = actualOp1->AsCall();
2144 assert(call->HasMultiRegRetVal());
2145
2146 genConsumeRegs(op1);
2147
2148 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
2149 unsigned regCount = retTypeDesc->GetReturnRegCount();
2150 assert(regCount == MAX_RET_REG_COUNT);
2151
2152 // Stack store
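    // For example, a TYP_LONG returned in EDX:EAX is stored as (illustrative):
    //     mov dword ptr [V_lcl+0x00], eax
    //     mov dword ptr [V_lcl+0x04], edx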
2153 int offset = 0;
2154 for (unsigned i = 0; i < regCount; ++i)
2155 {
2156 var_types type = retTypeDesc->GetReturnRegType(i);
2157 regNumber reg = call->GetRegNumByIdx(i);
2158 if (op1->IsCopyOrReload())
2159 {
2160 // GT_COPY/GT_RELOAD will have valid reg for those positions
2161 // that need to be copied or reloaded.
2162 regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
2163 if (reloadReg != REG_NA)
2164 {
2165 reg = reloadReg;
2166 }
2167 }
2168
2169 assert(reg != REG_NA);
2170 getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
2171 offset += genTypeSize(type);
2172 }
2173
2174 varDsc->lvRegNum = REG_STK;
2175#else // !UNIX_AMD64_ABI && !_TARGET_X86_
2176 assert(!"Unreached");
2177#endif // !UNIX_AMD64_ABI && !_TARGET_X86_
2178}
2179
2180//------------------------------------------------------------------------
2181// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP.
2182//
2183void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn)
2184{
2185 assert(compiler->compGeneratingProlog);
2186
2187 if (frameSize == 0)
2188 {
2189 return;
2190 }
2191
2192 const target_size_t pageSize = compiler->eeGetPageSize();
2193
2194 if (frameSize == REGSIZE_BYTES)
2195 {
2196 // Frame size is the same as register size.
2197 inst_RV(INS_push, REG_EAX, TYP_I_IMPL);
2198 }
2199 else if (frameSize < pageSize)
2200 {
2201 // Frame size is (0x0008..0x1000)
2202 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2203 }
2204 else if (frameSize < compiler->getVeryLargeFrameSize())
2205 {
2206 // Frame size is (0x1000..0x3000)
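        // For example, a 0x2500-byte frame on x64 emits roughly:
        //     test [rsp - 0x1000], rax
        //     test [rsp - 0x2000], rax
        //     sub  rsp, 0x2500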
2207
2208 getEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, -(int)pageSize);
2209
2210 if (frameSize >= 0x2000)
2211 {
2212 getEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, -2 * (int)pageSize);
2213 }
2214
2215 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2216 }
2217 else
2218 {
2219 // Frame size >= 0x3000
2220 assert(frameSize >= compiler->getVeryLargeFrameSize());
2221
2222 // Emit the following sequence to 'tickle' the pages.
        // Note it is important that the stack pointer not change until this is
2224 // complete since the tickles could cause a stack overflow, and we
2225 // need to be able to crawl the stack afterward (which means the
2226 // stack pointer needs to be known).
2227
2228 bool pushedStubParam = false;
2229 if (compiler->info.compPublishStubParam && (REG_SECRET_STUB_PARAM == initReg))
2230 {
2231 // push register containing the StubParam
2232 inst_RV(INS_push, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
2233 pushedStubParam = true;
2234 }
2235
2236#ifndef _TARGET_UNIX_
2237 instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
2238#endif
2239
2240 //
2241 // Can't have a label inside the ReJIT padding area
2242 //
2243 genPrologPadForReJit();
2244
2245#ifndef _TARGET_UNIX_
2246 // Code size for each instruction. We need this because the
2247 // backward branch is hard-coded with the number of bytes to branch.
2248 // The encoding differs based on the architecture and what register is
2249 // used (namely, using RAX has a smaller encoding).
2250 //
2251 // loop:
2252 // For x86
2253 // test [esp + eax], eax 3
2254 // sub eax, 0x1000 5
2255 // cmp EAX, -frameSize 5
2256 // jge loop 2
2257 //
2258 // For AMD64 using RAX
2259 // test [rsp + rax], rax 4
2260 // sub rax, 0x1000 6
2261 // cmp rax, -frameSize 6
2262 // jge loop 2
2263 //
2264 // For AMD64 using RBP
2265 // test [rsp + rbp], rbp 4
2266 // sub rbp, 0x1000 7
2267 // cmp rbp, -frameSize 7
2268 // jge loop 2
2269
2270 getEmitter()->emitIns_R_ARR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, initReg, 0);
2271 inst_RV_IV(INS_sub, initReg, pageSize, EA_PTRSIZE);
2272 inst_RV_IV(INS_cmp, initReg, -((ssize_t)frameSize), EA_PTRSIZE);
2273
2274 int bytesForBackwardJump;
2275#ifdef _TARGET_AMD64_
2276 assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
2277 bytesForBackwardJump = ((initReg == REG_EAX) ? -18 : -20);
2278#else // !_TARGET_AMD64_
2279 assert(initReg == REG_EAX);
2280 bytesForBackwardJump = -15;
2281#endif // !_TARGET_AMD64_
2282
2283 // Branch backwards to start of loop
2284 inst_IV(INS_jge, bytesForBackwardJump);
2285#else // _TARGET_UNIX_
2286 // Code size for each instruction. We need this because the
2287 // backward branch is hard-coded with the number of bytes to branch.
2288 // The encoding differs based on the architecture and what register is
2289 // used (namely, using RAX has a smaller encoding).
2290 //
2291 // For x86
2292 // lea eax, [esp - frameSize]
2293 // loop:
2294 // lea esp, [esp - pageSize] 7
2295 // test [esp], eax 3
2296 // cmp esp, eax 2
2297 // jge loop 2
2298 // lea rsp, [rbp + frameSize]
2299 //
2300 // For AMD64 using RAX
2301 // lea rax, [rsp - frameSize]
2302 // loop:
2303 // lea rsp, [rsp - pageSize] 8
2304 // test [rsp], rax 4
2305 // cmp rsp, rax 3
2306 // jge loop 2
2307 // lea rsp, [rax + frameSize]
2308 //
2309 // For AMD64 using RBP
2310 // lea rbp, [rsp - frameSize]
2311 // loop:
2312 // lea rsp, [rsp - pageSize] 8
2313 // test [rsp], rbp 4
2314 // cmp rsp, rbp 3
2315 // jge loop 2
2316 // lea rsp, [rbp + frameSize]
2317
2318 int sPageSize = (int)pageSize;
2319
2320 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, initReg, REG_SPBASE, -((ssize_t)frameSize)); // get frame border
2321
2322 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -sPageSize);
2323 getEmitter()->emitIns_R_AR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, 0);
2324 inst_RV_RV(INS_cmp, REG_SPBASE, initReg);
2325
2326 int bytesForBackwardJump;
2327#ifdef _TARGET_AMD64_
2328 assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
2329 bytesForBackwardJump = -17;
2330#else // !_TARGET_AMD64_
2331 assert(initReg == REG_EAX);
2332 bytesForBackwardJump = -14;
2333#endif // !_TARGET_AMD64_
2334
2335 inst_IV(INS_jge, bytesForBackwardJump); // Branch backwards to start of loop
2336
2337 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, initReg, frameSize); // restore stack pointer
2338#endif // _TARGET_UNIX_
2339
2340 *pInitRegZeroed = false; // The initReg does not contain zero
2341
2342 if (pushedStubParam)
2343 {
2344 // pop eax
2345 inst_RV(INS_pop, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
2346 regSet.verifyRegUsed(REG_SECRET_STUB_PARAM);
2347 }
2348
2349 // sub esp, frameSize 6
2350 inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
2351 }
2352
2353 compiler->unwindAllocStack(frameSize);
2354
2355 if (!doubleAlignOrFramePointerUsed())
2356 {
2357 psiAdjustStackLevel(frameSize);
2358 }
2359}
2360
2361//------------------------------------------------------------------------
2362// genLclHeap: Generate code for localloc.
2363//
2364// Arguments:
2365// tree - the localloc tree to generate.
2366//
2367// Notes:
2368// Note that for x86, we don't track ESP movements while generating the localloc code.
2369// The ESP tracking is used to report stack pointer-relative GC info, which is not
2370// interesting while doing the localloc construction. Also, for functions with localloc,
2371// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
2372// call arguments.
2373//
2374// For x86, we store the ESP after the localloc is complete in the LocAllocSP
2375// variable. This variable is implicitly reported to the VM in the GC info (its position
2376// is defined by convention relative to other items), and is used by the GC to find the
2377// "base" stack pointer in functions with localloc.
2378//
2379void CodeGen::genLclHeap(GenTree* tree)
2380{
2381 assert(tree->OperGet() == GT_LCLHEAP);
2382 assert(compiler->compLocallocUsed);
2383
2384 GenTree* size = tree->gtOp.gtOp1;
2385 noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
2386
2387 regNumber targetReg = tree->gtRegNum;
2388 regNumber regCnt = REG_NA;
2389 var_types type = genActualType(size->gtType);
2390 emitAttr easz = emitTypeSize(type);
2391 BasicBlock* endLabel = nullptr;
2392
2393#ifdef DEBUG
2394 genStackPointerCheck(compiler->opts.compStackCheckOnRet, compiler->lvaReturnSpCheck);
2395#endif
2396
2397 noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
2398 noway_assert(genStackLevel == 0); // Can't have anything on the stack
2399
2400 unsigned stackAdjustment = 0;
2401 BasicBlock* loop = nullptr;
2402
    // Compute the amount of memory to allocate, rounded up to a STACK_ALIGN boundary.
2404 size_t amount = 0;
2405 if (size->IsCnsIntOrI())
2406 {
2407 // If size is a constant, then it must be contained.
2408 assert(size->isContained());
2409
2410 // If amount is zero then return null in targetReg
2411 amount = size->gtIntCon.gtIconVal;
2412 if (amount == 0)
2413 {
2414 instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
2415 goto BAILOUT;
2416 }
2417
        // 'amount' is the total number of bytes to localloc, rounded up to a STACK_ALIGN boundary
2419 amount = AlignUp(amount, STACK_ALIGN);
2420 }
2421 else
2422 {
2423 // The localloc requested memory size is non-constant.
2424
2425 // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
2426 genConsumeRegAndCopy(size, targetReg);
2427 endLabel = genCreateTempLabel();
2428 getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
2429 inst_JMP(EJ_je, endLabel);
2430
2431 // Compute the size of the block to allocate and perform alignment.
2432 // If compInitMem=true, we can reuse targetReg as regcnt,
2433 // since we don't need any internal registers.
2434 if (compiler->info.compInitMem)
2435 {
2436 assert(tree->AvailableTempRegCount() == 0);
2437 regCnt = targetReg;
2438 }
2439 else
2440 {
2441 regCnt = tree->ExtractTempReg();
2442 if (regCnt != targetReg)
2443 {
2444 // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
2445 inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
2446 }
2447 }
2448
2449 // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
2450 // by code like:
2451 // add reg, 15
2452 // and reg, -16
2453 // However, in the initialized memory case, we need the count of STACK_ALIGN-sized
2454 // elements, not a byte count, after the alignment. So instead of the "and", which
2455 // becomes unnecessary, generate a shift, e.g.:
2456 // add reg, 15
2457 // shr reg, 4
2458
2459 inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
2460
2461 if (compiler->info.compInitMem)
2462 {
2463 // Convert the count from a count of bytes to a loop count. We will loop once per
2464 // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes
2465 // on x64 and Linux/x86.
2466 //
            // Note that we zero a single reg-size word per iteration on Windows/x86, four
            // reg-size words per iteration on Linux/x86, and two reg-size words per iteration
            // on x64. We will shift off all the stack alignment bits
2469 // added above, so there is no need for an 'and' instruction.
2470
2471 // --- shr regCnt, 2 (or 4) ---
2472 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
2473 }
2474 else
2475 {
2476 // Otherwise, mask off the low bits to align the byte count.
2477 inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
2478 }
2479 }
2480
2481#if FEATURE_FIXED_OUT_ARGS
2482 // If we have an outgoing arg area then we must adjust the SP by popping off the
2483 // outgoing arg area. We will restore it right before we return from this method.
2484 //
    // Localloc returns stack space that is aligned to STACK_ALIGN bytes. The following
2486 // are the cases that need to be handled:
2487 // i) Method has out-going arg area.
2488 // It is guaranteed that size of out-going arg area is STACK_ALIGN'ed (see fgMorphArgs).
2489 // Therefore, we will pop off the out-going arg area from RSP before allocating the localloc space.
2490 // ii) Method has no out-going arg area.
2491 // Nothing to pop off from the stack.
2492 if (compiler->lvaOutgoingArgSpaceSize > 0)
2493 {
2494 assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain
2495 // aligned
2496 inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
2497 stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
2498 }
2499#endif
2500
2501 if (size->IsCnsIntOrI())
2502 {
2503 // We should reach here only for non-zero, constant size allocations.
2504 assert(amount > 0);
2505 assert((amount % STACK_ALIGN) == 0);
2506 assert((amount % REGSIZE_BYTES) == 0);
2507
2508 // For small allocations we will generate up to six push 0 inline
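        // (e.g., on x64 a 40-byte request is rounded up to 48 bytes above and
        // emits six "push 0" instructions).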
2509 size_t cntRegSizedWords = amount / REGSIZE_BYTES;
2510 if (cntRegSizedWords <= 6)
2511 {
2512 for (; cntRegSizedWords != 0; cntRegSizedWords--)
2513 {
2514 inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
2515 }
2516 goto ALLOC_DONE;
2517 }
2518
2519 bool doNoInitLessThanOnePageAlloc =
2520 !compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <=
2521
2522#ifdef _TARGET_X86_
2523 bool needRegCntRegister = true;
2524#else // !_TARGET_X86_
2525 bool needRegCntRegister = !doNoInitLessThanOnePageAlloc;
2526#endif // !_TARGET_X86_
2527
2528 if (needRegCntRegister)
2529 {
2530 // If compInitMem=true, we can reuse targetReg as regcnt.
2531 // Since size is a constant, regCnt is not yet initialized.
2532 assert(regCnt == REG_NA);
2533 if (compiler->info.compInitMem)
2534 {
2535 assert(tree->AvailableTempRegCount() == 0);
2536 regCnt = targetReg;
2537 }
2538 else
2539 {
2540 regCnt = tree->ExtractTempReg();
2541 }
2542 }
2543
2544 if (doNoInitLessThanOnePageAlloc)
2545 {
2546 // Since the size is less than a page, simply adjust ESP.
2547 // ESP might already be in the guard page, so we must touch it BEFORE
2548 // the alloc, not after.
2549 CLANG_FORMAT_COMMENT_ANCHOR;
2550
2551#ifdef _TARGET_X86_
2552 // For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment
2553 // to ESP. So do the work in the count register.
2554 // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require
2555 // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't
2556 // track".
2557 inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL);
2558 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2559 inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE);
2560 inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL);
2561#else // !_TARGET_X86_
2562 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2563 inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);
2564#endif // !_TARGET_X86_
2565
2566 goto ALLOC_DONE;
2567 }
2568
2569 // else, "mov regCnt, amount"
2570
2571 if (compiler->info.compInitMem)
2572 {
2573 // When initializing memory, we want 'amount' to be the loop count.
2574 assert((amount % STACK_ALIGN) == 0);
2575 amount /= STACK_ALIGN;
2576 }
2577
2578 genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG);
2579 }
2580
2581 loop = genCreateTempLabel();
2582 if (compiler->info.compInitMem)
2583 {
2584 // At this point 'regCnt' is set to the number of loop iterations for this loop, if each
2585 // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
2586 // Since we have to zero out the allocated memory AND ensure that RSP is always valid
2587 // by tickling the pages, we will just push 0's on the stack.
2588
2589 assert(genIsValidIntReg(regCnt));
2590
2591 // Loop:
2592 genDefineTempLabel(loop);
2593
2594 static_assert_no_msg((STACK_ALIGN % REGSIZE_BYTES) == 0);
2595 unsigned const count = (STACK_ALIGN / REGSIZE_BYTES);
2596
2597 for (unsigned i = 0; i < count; i++)
2598 {
2599 inst_IV(INS_push_hide, 0); // --- push REG_SIZE bytes of 0
2600 }
2601 // Note that the stack must always be aligned to STACK_ALIGN bytes
2602
2603 // Decrement the loop counter and loop if not done.
2604 inst_RV(INS_dec, regCnt, TYP_I_IMPL);
2605 inst_JMP(EJ_jne, loop);
2606 }
2607 else
2608 {
2609 // At this point 'regCnt' is set to the total number of bytes to localloc.
2610 //
2611 // We don't need to zero out the allocated memory. However, we do have
2612 // to tickle the pages to ensure that ESP is always valid and is
2613 // in sync with the "stack guard page". Note that in the worst
2614 // case ESP is on the last byte of the guard page. Thus you must
        // touch ESP+0 first, not ESP+0x1000.
2616 //
2617 // Another subtlety is that you don't want ESP to be exactly on the
2618 // boundary of the guard page because PUSH is predecrement, thus
2619 // call setup would not touch the guard page but just beyond it
2620 //
2621 // Note that we go through a few hoops so that ESP never points to
2622 // illegal pages at any time during the tickling process
2623 //
2624 // neg REGCNT
2625 // add REGCNT, ESP // reg now holds ultimate ESP
2626 // jb loop // result is smaller than orignial ESP (no wrap around)
2627 // xor REGCNT, REGCNT, // Overflow, pick lowest possible number
2628 // loop:
2629 // test ESP, [ESP+0] // tickle the page
2630 // mov REGTMP, ESP
2631 // sub REGTMP, GetOsPageSize()
2632 // mov ESP, REGTMP
2633 // cmp ESP, REGCNT
2634 // jae loop
2635 //
2636 // mov ESP, REG
2637 // end:
2638 inst_RV(INS_NEG, regCnt, TYP_I_IMPL);
2639 inst_RV_RV(INS_add, regCnt, REG_SPBASE, TYP_I_IMPL);
2640 inst_JMP(EJ_jb, loop);
2641
2642 instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
2643
2644 genDefineTempLabel(loop);
2645
        // Tickle the decremented value, and move it back to ESP. Note that this has
        // to be done BEFORE the update of ESP since ESP might already be on the
        // guard page. It is OK to leave the final value of ESP on the guard page.
2650 getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
2651
2652 // This is a harmless trick to avoid the emitter trying to track the
2653 // decrement of the ESP - we do the subtraction in another reg instead
2654 // of adjusting ESP directly.
2655 regNumber regTmp = tree->GetSingleTempReg();
2656
2657 inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
2658 inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE);
2659 inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL);
2660
2661 inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL);
2662 inst_JMP(EJ_jae, loop);
2663
2664 // Move the final value to ESP
2665 inst_RV_RV(INS_mov, REG_SPBASE, regCnt);
2666 }
2667
2668ALLOC_DONE:
2669 // Re-adjust SP to allocate out-going arg area
2670 if (stackAdjustment > 0)
2671 {
2672 assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
2673 inst_RV_IV(INS_sub, REG_SPBASE, stackAdjustment, EA_PTRSIZE);
2674 }
2675
2676 // Return the stackalloc'ed address in result register.
2677 // TargetReg = RSP + stackAdjustment.
2678 getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, targetReg, REG_SPBASE, stackAdjustment);
2679
2680 if (endLabel != nullptr)
2681 {
2682 genDefineTempLabel(endLabel);
2683 }
2684
2685BAILOUT:
2686
2687#ifdef JIT32_GCENCODER
2688 if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
2689 {
2690 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0);
2691 }
2692#endif // JIT32_GCENCODER
2693
2694#if STACK_PROBES
2695 if (compiler->opts.compNeedStackProbes)
2696 {
2697 genGenerateStackProbe();
2698 }
2699#endif
2700
2701#ifdef DEBUG
2702 // Update local variable to reflect the new stack pointer.
2703 if (compiler->opts.compStackCheckOnRet)
2704 {
2705 noway_assert(compiler->lvaReturnSpCheck != 0xCCCCCCCC &&
2706 compiler->lvaTable[compiler->lvaReturnSpCheck].lvDoNotEnregister &&
2707 compiler->lvaTable[compiler->lvaReturnSpCheck].lvOnFrame);
2708 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnSpCheck, 0);
2709 }
2710#endif
2711
2712 genProduceReg(tree);
2713}
2714
2715void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
2716{
2717 assert(storeBlkNode->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
2718
2719 if (storeBlkNode->OperIs(GT_STORE_OBJ) && storeBlkNode->OperIsCopyBlkOp() && !storeBlkNode->gtBlkOpGcUnsafe)
2720 {
2721 assert(storeBlkNode->AsObj()->gtGcPtrCount != 0);
2722 genCodeForCpObj(storeBlkNode->AsObj());
2723 return;
2724 }
2725
2726#ifdef JIT32_GCENCODER
2727 assert(!storeBlkNode->gtBlkOpGcUnsafe);
2728#else
2729 if (storeBlkNode->gtBlkOpGcUnsafe)
2730 {
2731 getEmitter()->emitDisableGC();
2732 }
2733#endif // JIT32_GCENCODER
2734
2735 bool isCopyBlk = storeBlkNode->OperIsCopyBlkOp();
2736
2737 switch (storeBlkNode->gtBlkOpKind)
2738 {
2739#ifdef _TARGET_AMD64_
2740 case GenTreeBlk::BlkOpKindHelper:
2741 if (isCopyBlk)
2742 {
2743 genCodeForCpBlk(storeBlkNode);
2744 }
2745 else
2746 {
2747 genCodeForInitBlk(storeBlkNode);
2748 }
2749 break;
2750#endif // _TARGET_AMD64_
2751 case GenTreeBlk::BlkOpKindRepInstr:
2752 if (isCopyBlk)
2753 {
2754 genCodeForCpBlkRepMovs(storeBlkNode);
2755 }
2756 else
2757 {
2758 genCodeForInitBlkRepStos(storeBlkNode);
2759 }
2760 break;
2761 case GenTreeBlk::BlkOpKindUnroll:
2762 if (isCopyBlk)
2763 {
2764 genCodeForCpBlkUnroll(storeBlkNode);
2765 }
2766 else
2767 {
2768 genCodeForInitBlkUnroll(storeBlkNode);
2769 }
2770 break;
2771 default:
2772 unreached();
2773 }
2774
2775#ifndef JIT32_GCENCODER
2776 if (storeBlkNode->gtBlkOpGcUnsafe)
2777 {
2778 getEmitter()->emitEnableGC();
2779 }
2780#endif // !defined(JIT32_GCENCODER)
2781}
2782
2783//
2784//------------------------------------------------------------------------
2785// genCodeForInitBlkRepStos: Generate code for InitBlk using rep stos.
2786//
2787// Arguments:
2788// initBlkNode - The Block store for which we are generating code.
2789//
2790// Preconditions:
2791// On x64:
2792// The size of the buffers must be a constant and also less than INITBLK_STOS_LIMIT bytes.
2793// Any value larger than that, we'll use the helper even if both the fill byte and the
2794// size are integer constants.
2795// On x86:
2796// The size must either be a non-constant or less than INITBLK_STOS_LIMIT bytes.
2797//
2798void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
2799{
2800 // Make sure we got the arguments of the initblk/initobj operation in the right registers.
2801 unsigned size = initBlkNode->Size();
2802 GenTree* dstAddr = initBlkNode->Addr();
2803 GenTree* initVal = initBlkNode->Data();
2804 if (initVal->OperIsInitVal())
2805 {
2806 initVal = initVal->gtGetOp1();
2807 }
2808
2809#ifdef DEBUG
2810 assert(dstAddr->isUsedFromReg());
2811 assert(initVal->isUsedFromReg());
2812#ifdef _TARGET_AMD64_
2813 assert(size != 0);
2814#endif
2815 if (initVal->IsCnsIntOrI())
2816 {
2817#ifdef _TARGET_AMD64_
2818 assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
2819#else
2820 // Note that a size of zero means a non-constant size.
2821 assert((size == 0) || (size > CPBLK_UNROLL_LIMIT));
2822#endif
2823 }
2824
2825#endif // DEBUG
2826
2827 genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
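    // rep stosb stores AL into [RDI] RCX times, advancing RDI; the call above placed the
    // destination address in RDI, the fill value in RAX, and the byte count in RCX.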
2828 instGen(INS_r_stosb);
2829}
2830
2831// Generate code for InitBlk by performing a loop unroll
2832// Preconditions:
2833// a) Both the size and fill byte value are integer constants.
2834// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
2835//
2836void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
2837{
2838 // Make sure we got the arguments of the initblk/initobj operation in the right registers
2839 unsigned size = initBlkNode->Size();
2840 GenTree* dstAddr = initBlkNode->Addr();
2841 GenTree* initVal = initBlkNode->Data();
2842 if (initVal->OperIsInitVal())
2843 {
2844 initVal = initVal->gtGetOp1();
2845 }
2846
2847 assert(dstAddr->isUsedFromReg());
2848 assert(initVal->isUsedFromReg() || (initVal->IsIntegralConst(0) && ((size & 0xf) == 0)));
2849 assert(size != 0);
2850 assert(size <= INITBLK_UNROLL_LIMIT);
2851 assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI());
2852
2853 emitter* emit = getEmitter();
2854
2855 genConsumeOperands(initBlkNode);
2856
2857 // If the initVal was moved, or spilled and reloaded to a different register,
2858 // get the original initVal from below the GT_RELOAD, but only after capturing the valReg,
2859 // which needs to be the new register.
2860 regNumber valReg = initVal->gtRegNum;
2861 initVal = initVal->gtSkipReloadOrCopy();
2862
2863 unsigned offset = 0;
2864
2865 // Perform an unroll using SSE2 loads and stores.
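    // For example, a 24-byte zero-fill on x64 emits roughly (register names illustrative):
    //     xorps   xmm0, xmm0
    //     movdqu  xmmword ptr [rdi], xmm0
    //     mov     qword ptr [rdi+16], rcx    ; rcx holds the (zero) fill value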
2866 if (size >= XMM_REGSIZE_BYTES)
2867 {
2868 regNumber tmpReg = initBlkNode->GetSingleTempReg();
2869 assert(genIsValidFloatReg(tmpReg));
2870
2871 if (initVal->gtIntCon.gtIconVal != 0)
2872 {
2873 emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, tmpReg, valReg);
2874 emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
2875#ifdef _TARGET_X86_
2876 // For x86, we need one more to convert it from 8 bytes to 16 bytes.
2877 emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
2878#endif // _TARGET_X86_
2879 }
2880 else
2881 {
2882 emit->emitIns_R_R(INS_xorps, EA_8BYTE, tmpReg, tmpReg);
2883 }
2884
2885 // Determine how many 16 byte slots we're going to fill using SSE movs.
2886 size_t slots = size / XMM_REGSIZE_BYTES;
2887
2888 while (slots-- > 0)
2889 {
2890 emit->emitIns_AR_R(INS_movdqu, EA_8BYTE, tmpReg, dstAddr->gtRegNum, offset);
2891 offset += XMM_REGSIZE_BYTES;
2892 }
2893 }
2894
2895 // Fill the remainder (or a < 16 byte sized struct)
2896 if ((size & 8) != 0)
2897 {
2898#ifdef _TARGET_X86_
2899 // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
2900 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2901 offset += 4;
2902 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2903 offset += 4;
2904#else // !_TARGET_X86_
2905
2906 emit->emitIns_AR_R(INS_mov, EA_8BYTE, valReg, dstAddr->gtRegNum, offset);
2907 offset += 8;
2908
2909#endif // !_TARGET_X86_
2910 }
2911 if ((size & 4) != 0)
2912 {
2913 emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
2914 offset += 4;
2915 }
2916 if ((size & 2) != 0)
2917 {
2918 emit->emitIns_AR_R(INS_mov, EA_2BYTE, valReg, dstAddr->gtRegNum, offset);
2919 offset += 2;
2920 }
2921 if ((size & 1) != 0)
2922 {
2923 emit->emitIns_AR_R(INS_mov, EA_1BYTE, valReg, dstAddr->gtRegNum, offset);
2924 }
2925}
2926
2927// Generates code for InitBlk by calling the VM memset helper function.
2928// Preconditions:
2929// a) The size argument of the InitBlk is not an integer constant.
2930// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes.
2931void CodeGen::genCodeForInitBlk(GenTreeBlk* initBlkNode)
2932{
2933#ifdef _TARGET_AMD64_
2934 // Make sure we got the arguments of the initblk operation in the right registers
2935 unsigned blockSize = initBlkNode->Size();
2936 GenTree* dstAddr = initBlkNode->Addr();
2937 GenTree* initVal = initBlkNode->Data();
2938 if (initVal->OperIsInitVal())
2939 {
2940 initVal = initVal->gtGetOp1();
2941 }
2942
2943 assert(dstAddr->isUsedFromReg());
2944 assert(initVal->isUsedFromReg());
2945
2946 if (blockSize != 0)
2947 {
2948 assert(blockSize >= CPBLK_MOVS_LIMIT);
2949 }
2950
2951 genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
2952
2953 genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
2954#else // !_TARGET_AMD64_
2955 NYI_X86("Helper call for InitBlk");
2956#endif // !_TARGET_AMD64_
2957}
2958
2959// Generate code for a load from some address + offset
2960// baseNode: tree node which can be either a local address or arbitrary node
2961// offset: distance from the baseNode from which to load
2962void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset)
2963{
2964 emitter* emit = getEmitter();
2965
2966 if (baseNode->OperIsLocalAddr())
2967 {
2968 if (baseNode->gtOper == GT_LCL_FLD_ADDR)
2969 {
2970 offset += baseNode->gtLclFld.gtLclOffs;
2971 }
2972 emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset);
2973 }
2974 else
2975 {
2976 emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset);
2977 }
2978}
2979
2980//------------------------------------------------------------------------
2981// genCodeForStoreOffset: Generate code to store a reg to [base + offset].
2982//
2983// Arguments:
2984// ins - the instruction to generate.
2985// size - the size that needs to be stored.
2986// src - the register which needs to be stored.
2987// baseNode - the base, relative to which to store the src register.
2988// offset - the offset that is added to the baseNode to calculate the address to store into.
2989//
2990void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset)
2991{
2992 emitter* emit = getEmitter();
2993
2994 if (baseNode->OperIsLocalAddr())
2995 {
2996 if (baseNode->gtOper == GT_LCL_FLD_ADDR)
2997 {
2998 offset += baseNode->gtLclFld.gtLclOffs;
2999 }
3000
3001 emit->emitIns_S_R(ins, size, src, baseNode->AsLclVarCommon()->GetLclNum(), offset);
3002 }
3003 else
3004 {
3005 emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset);
3006 }
3007}
3008
3009// Generates CpBlk code by performing a loop unroll
3010// Preconditions:
3011// The size argument of the CpBlk node is a constant and <= 64 bytes.
3012// This may seem small but covers >95% of the cases in several framework assemblies.
3013//
3014void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
3015{
3016 // Make sure we got the arguments of the cpblk operation in the right registers
3017 unsigned size = cpBlkNode->Size();
3018 GenTree* dstAddr = cpBlkNode->Addr();
3019 GenTree* source = cpBlkNode->Data();
3020 GenTree* srcAddr = nullptr;
3021 assert(size <= CPBLK_UNROLL_LIMIT);
3022
3023 emitter* emit = getEmitter();
3024
3025 if (dstAddr->isUsedFromReg())
3026 {
3027 genConsumeReg(dstAddr);
3028 }
3029
3030 if (source->gtOper == GT_IND)
3031 {
3032 srcAddr = source->gtGetOp1();
3033 if (srcAddr->isUsedFromReg())
3034 {
3035 genConsumeReg(srcAddr);
3036 }
3037 }
3038 else
3039 {
3040 noway_assert(source->IsLocal());
3041 // TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree.
3042 // OR: transform source to GT_IND(GT_LCL_VAR_ADDR)
3043 if (source->OperGet() == GT_LCL_VAR)
3044 {
3045 source->SetOper(GT_LCL_VAR_ADDR);
3046 }
3047 else
3048 {
3049 assert(source->OperGet() == GT_LCL_FLD);
3050 source->SetOper(GT_LCL_FLD_ADDR);
3051 }
3052 srcAddr = source;
3053 }
3054
3055 unsigned offset = 0;
3056
    // If the size of this struct is 16 bytes or more,
3058 // let's use SSE2 to be able to do 16 byte at a time
3059 // loads and stores.
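    //
    // For example, copying a 24-byte struct on x64 emits roughly (register names illustrative):
    //     movdqu  xmm0, xmmword ptr [rsi]
    //     movdqu  xmmword ptr [rdi], xmm0
    //     mov     rax, qword ptr [rsi+16]
    //     mov     qword ptr [rdi+16], rax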
3060
3061 if (size >= XMM_REGSIZE_BYTES)
3062 {
3063 regNumber xmmReg = cpBlkNode->GetSingleTempReg(RBM_ALLFLOAT);
3064 assert(genIsValidFloatReg(xmmReg));
3065 size_t slots = size / XMM_REGSIZE_BYTES;
3066
3067 // TODO: In the below code the load and store instructions are for 16 bytes, but the
3068 // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
3069 // this probably needs to be changed.
3070 while (slots-- > 0)
3071 {
3072 // Load
3073 genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset);
3074 // Store
3075 genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
3076 offset += XMM_REGSIZE_BYTES;
3077 }
3078 }
3079
3080 // Fill the remainder (15 bytes or less) if there's one.
3081 if ((size & 0xf) != 0)
3082 {
3083 // Grab the integer temp register to emit the remaining loads and stores.
3084 regNumber tmpReg = cpBlkNode->GetSingleTempReg(RBM_ALLINT);
3085
3086 if ((size & 8) != 0)
3087 {
3088#ifdef _TARGET_X86_
3089 // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
3090 for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4)
3091 {
3092 genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
3093 genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
3094 }
3095#else // !_TARGET_X86_
3096 genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset);
3097 genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
3098 offset += 8;
3099#endif // !_TARGET_X86_
3100 }
3101 if ((size & 4) != 0)
3102 {
3103 genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
3104 genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
3105 offset += 4;
3106 }
3107 if ((size & 2) != 0)
3108 {
3109 genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset);
3110 genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
3111 offset += 2;
3112 }
3113 if ((size & 1) != 0)
3114 {
3115 genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset);
3116 genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
3117 }
3118 }
3119}
3120
3121// Generate code for CpBlk by using rep movs
3122// Preconditions:
3123// The size argument of the CpBlk is a constant and is between
3124// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
3125void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode)
3126{
3127 // Make sure we got the arguments of the cpblk operation in the right registers
3128 unsigned size = cpBlkNode->Size();
3129 GenTree* dstAddr = cpBlkNode->Addr();
3130 GenTree* source = cpBlkNode->Data();
3131 GenTree* srcAddr = nullptr;
3132
3133#ifdef DEBUG
3134 assert(dstAddr->isUsedFromReg());
3135 assert(source->isContained());
3136
3137#ifdef _TARGET_X86_
3138 if (size == 0)
3139 {
3140 noway_assert(cpBlkNode->OperGet() == GT_STORE_DYN_BLK);
3141 }
3142 else
3143#endif
3144 {
#ifdef _TARGET_AMD64_
3146 assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
3147#else
3148 assert(size > CPBLK_UNROLL_LIMIT);
3149#endif
3150 }
3151#endif // DEBUG
3152
3153 genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX);
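    // rep movsb copies RCX bytes from [RSI] to [RDI], advancing both pointers; the call
    // above placed the destination in RDI, the source in RSI, and the byte count in RCX.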
3154 instGen(INS_r_movsb);
3155}
3156
3157#ifdef FEATURE_PUT_STRUCT_ARG_STK
3158//------------------------------------------------------------------------
3159// CodeGen::genMove8IfNeeded: Conditionally move 8 bytes of a struct to the argument area
3160//
3161// Arguments:
3162// size - The size of bytes remaining to be moved
3163// longTmpReg - The tmp register to be used for the long value
3164// srcAddr - The address of the source struct
3165// offset - The current offset being copied
3166//
3167// Return Value:
3168// Returns the number of bytes moved (8 or 0).
3169//
3170// Notes:
3171// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3172// not an even multiple of 16.
3173// On x86, longTmpReg must be an xmm reg; on x64 it must be an integer register.
3174// This is checked by genStoreRegToStackArg.
3175//
3176unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree* srcAddr, unsigned offset)
3177{
3178#ifdef _TARGET_X86_
3179 instruction longMovIns = INS_movq;
3180#else // !_TARGET_X86_
3181 instruction longMovIns = INS_mov;
3182#endif // !_TARGET_X86_
3183 if ((size & 8) != 0)
3184 {
3185 genCodeForLoadOffset(longMovIns, EA_8BYTE, longTmpReg, srcAddr, offset);
3186 genStoreRegToStackArg(TYP_LONG, longTmpReg, offset);
3187 return 8;
3188 }
3189 return 0;
3190}
3191
3192//------------------------------------------------------------------------
3193// CodeGen::genMove4IfNeeded: Conditionally move 4 bytes of a struct to the argument area
3194//
3195// Arguments:
3196// size - The size of bytes remaining to be moved
3197// intTmpReg - The tmp register to be used for the long value
3198// srcAddr - The address of the source struct
3199// offset - The current offset being copied
3200//
3201// Return Value:
3202// Returns the number of bytes moved (4 or 0).
3203//
3204// Notes:
3205// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3206// not an even multiple of 16.
3207// intTmpReg must be an integer register.
3208// This is checked by genStoreRegToStackArg.
3209//
3210unsigned CodeGen::genMove4IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3211{
3212 if ((size & 4) != 0)
3213 {
3214 genCodeForLoadOffset(INS_mov, EA_4BYTE, intTmpReg, srcAddr, offset);
3215 genStoreRegToStackArg(TYP_INT, intTmpReg, offset);
3216 return 4;
3217 }
3218 return 0;
3219}
3220
3221//------------------------------------------------------------------------
3222// CodeGen::genMove2IfNeeded: Conditionally move 2 bytes of a struct to the argument area
3223//
3224// Arguments:
3225// size - The size of bytes remaining to be moved
3226// intTmpReg - The tmp register to be used for the long value
3227// srcAddr - The address of the source struct
3228// offset - The current offset being copied
3229//
3230// Return Value:
3231// Returns the number of bytes moved (2 or 0).
3232//
3233// Notes:
3234// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3235// not an even multiple of 16.
3236// intTmpReg must be an integer register.
3237// This is checked by genStoreRegToStackArg.
3238//
3239unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3240{
3241 if ((size & 2) != 0)
3242 {
3243 genCodeForLoadOffset(INS_mov, EA_2BYTE, intTmpReg, srcAddr, offset);
3244 genStoreRegToStackArg(TYP_SHORT, intTmpReg, offset);
3245 return 2;
3246 }
3247 return 0;
3248}
3249
3250//------------------------------------------------------------------------
3251// CodeGen::genMove1IfNeeded: Conditionally move 1 byte of a struct to the argument area
3252//
3253// Arguments:
3254// size - The size of bytes remaining to be moved
3255// intTmpReg - The tmp register to be used for the long value
3256// srcAddr - The address of the source struct
3257// offset - The current offset being copied
3258//
3259// Return Value:
3260// Returns the number of bytes moved (1 or 0).
3261//
3262// Notes:
3263// This is used in the PutArgStkKindUnroll case, to move any bytes that are
3264// not an even multiple of 16.
3265// intTmpReg must be an integer register.
3266// This is checked by genStoreRegToStackArg.
3267//
3268unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
3269{
3270 if ((size & 1) != 0)
3271 {
3272 genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset);
3273 genStoreRegToStackArg(TYP_BYTE, intTmpReg, offset);
3274 return 1;
3275 }
3276 return 0;
3277}
3278
3279//---------------------------------------------------------------------------------------------------------------//
3280// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling.
3281//
3282// Arguments:
3283// putArgNode - the PutArgStk tree.
3284//
3285// Notes:
3286// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct will be copied to the
3287// stack.
3288//
3289// TODO-Amd64-Unix: Try to share code with copyblk.
3290// Need refactoring of copyblk before it could be used for putarg_stk.
3291// The difference for now is that a putarg_stk contains its children, while cpyblk does not.
3292// This creates differences in code. After some significant refactoring it could be reused.
3293//
3294void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
3295{
3296 GenTree* src = putArgNode->gtOp.gtOp1;
3297 // We will never call this method for SIMD types, which are stored directly
3298 // in genPutStructArgStk().
3299 noway_assert(src->TypeGet() == TYP_STRUCT);
3300
3301 unsigned size = putArgNode->getArgSize();
3302 assert(size <= CPBLK_UNROLL_LIMIT);
3303
3304 emitter* emit = getEmitter();
3305 unsigned putArgOffset = putArgNode->getArgOffset();
3306
3307 assert(src->isContained());
3308
3309 assert(src->gtOper == GT_OBJ);
3310
3311 if (src->gtOp.gtOp1->isUsedFromReg())
3312 {
3313 genConsumeReg(src->gtOp.gtOp1);
3314 }
3315
3316 unsigned offset = 0;
3317
3318 regNumber xmmTmpReg = REG_NA;
3319 regNumber intTmpReg = REG_NA;
3320 regNumber longTmpReg = REG_NA;
3321#ifdef _TARGET_X86_
3322 // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
3323 // less than 16 bytes, we will just be using pushes
3324 if (size >= 8)
3325 {
3326 xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
3327 longTmpReg = xmmTmpReg;
3328 }
3329 if ((size & 0x7) != 0)
3330 {
3331 intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
3332 }
3333#else // !_TARGET_X86_
3334 // On x64 we use an XMM register only for 16-byte chunks.
3335 if (size >= XMM_REGSIZE_BYTES)
3336 {
3337 xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
3338 }
3339 if ((size & 0xf) != 0)
3340 {
3341 intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
3342 longTmpReg = intTmpReg;
3343 }
3344#endif // !_TARGET_X86_
3345
    // If the size of this struct is 16 bytes or more,
3347 // let's use SSE2 to be able to do 16 byte at a time
3348 // loads and stores.
3349 if (size >= XMM_REGSIZE_BYTES)
3350 {
3351#ifdef _TARGET_X86_
3352 assert(!m_pushStkArg);
3353#endif // _TARGET_X86_
3354 size_t slots = size / XMM_REGSIZE_BYTES;
3355
3356 assert(putArgNode->gtGetOp1()->isContained());
3357 assert(putArgNode->gtGetOp1()->gtOp.gtOper == GT_OBJ);
3358
3359 // TODO: In the below code the load and store instructions are for 16 bytes, but the
3360 // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
3361 // this probably needs to be changed.
3362 while (slots-- > 0)
3363 {
3364 // Load
3365 genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
3366
3367 // Store
3368 genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
3369
3370 offset += XMM_REGSIZE_BYTES;
3371 }
3372 }
3373
3374 // Fill the remainder (15 bytes or less) if there's one.
3375 if ((size & 0xf) != 0)
3376 {
3377#ifdef _TARGET_X86_
3378 if (m_pushStkArg)
3379 {
3380 // This case is currently supported only for the case where the total size is
3381 // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
3382 // order. However, morph has ensured that we have a struct that is an even
3383 // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
3384 assert(((size & 0xc) == size) && (offset == 0));
3385 // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
3386 // whether we've got an 8 byte chunk, and then push it on the stack.
3387 unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, size & 0x8);
3388 // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
3389 // and push it on the stack.
3390 pushedBytes += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, 0);
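            // Because pushes grow the stack downward, pushing the higher-offset 4-byte
            // chunk first and the offset-0 8-byte chunk second leaves the struct bytes
            // in ascending-address order in the outgoing argument space.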
3391 }
3392 else
3393#endif // _TARGET_X86_
3394 {
3395 offset += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, offset);
3396 offset += genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3397 offset += genMove2IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3398 offset += genMove1IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
3399 assert(offset == size);
3400 }
3401 }
3402}
3403
3404//------------------------------------------------------------------------
3405// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs.
3406//
3407// Arguments:
3408// putArgNode - the PutArgStk tree.
3409//
3410// Preconditions:
3411// The size argument of the PutArgStk (for structs) is a constant and is between
3412// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
3413// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go.
3414//
3415void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
3416{
3417 GenTree* srcAddr = putArgNode->gtGetOp1();
3418 assert(srcAddr->TypeGet() == TYP_STRUCT);
3419 assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT);
3420
3421 // Make sure we got the arguments of the cpblk operation in the right registers, and that
3422 // 'srcAddr' is contained as expected.
3423 assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
3424 assert(srcAddr->isContained());
3425
3426 genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
3427 instGen(INS_r_movsb);
3428}
3429
3430//------------------------------------------------------------------------
// If any Vector3 args are on the stack and are not passed by reference, their upper
// 32 bits must be cleared to zero. The native compiler doesn't clear the upper bits,
// and there is no way to know whether the caller is native or not. So the upper
// 32 bits of a Vector3 argument on the stack are always cleared to zero.
3435#if defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
3436void CodeGen::genClearStackVec3ArgUpperBits()
3437{
3438#ifdef DEBUG
3439 if (verbose)
3440 {
3441 printf("*************** In genClearStackVec3ArgUpperBits()\n");
3442 }
3443#endif
3444
3445 assert(compiler->compGeneratingProlog);
3446
3449 for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++)
3450 {
3451 LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
3452 assert(varDsc->lvIsParam);
3453
        // Does the var have a SIMD12 type?
3455 if (varDsc->lvType != TYP_SIMD12)
3456 {
3457 continue;
3458 }
3459
3460 if (!varDsc->lvIsRegArg)
3461 {
3462 // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0
3463 getEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0);
3464 }
3465 else
3466 {
            // Assume that for x64 Linux, an argument is either fully in registers
            // or fully on the stack.
3469 regNumber argReg = varDsc->GetOtherArgReg();
3470
3471 // Clear the upper 32 bits by two shift instructions.
3472 // argReg = argReg << 96
3473 getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
3474 // argReg = argReg >> 96
3475 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
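            // For example (illustrative), for an argument homed in xmm1 the two emits above produce:
            //     pslldq xmm1, 12
            //     psrldq xmm1, 12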
3476 }
3477 }
3478}
3479#endif // defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
3480#endif // FEATURE_PUT_STRUCT_ARG_STK
3481
// Generate code for CpObj nodes, which copy structs that have interleaved GC pointers.
// This generates a sequence of movsp instructions for runs of non-GC members, and calls
// to the CORINFO_HELP_ASSIGN_BYREF helper for the GC-pointer slots.
// Note that movsp is an alias for movsd on x86 and movsq on x64.
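//
// For example (illustrative), copying a struct with slot layout { ref, non-GC, non-GC, ref } to a
// destination that may be on the GC heap would emit roughly:
//     call CORINFO_HELP_ASSIGN_BYREF   ; first GC slot (the helper advances RSI/RDI)
//     movsp                            ; two non-GC slots (rep movsp is used for longer runs)
//     movsp
//     call CORINFO_HELP_ASSIGN_BYREF   ; second GC slot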
3487void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
3488{
3489 // Make sure we got the arguments of the cpobj operation in the right registers
3490 GenTree* dstAddr = cpObjNode->Addr();
3491 GenTree* source = cpObjNode->Data();
3492 GenTree* srcAddr = nullptr;
3493 var_types srcAddrType = TYP_BYREF;
3494 bool sourceIsLocal = false;
3495
3496 assert(source->isContained());
3497 if (source->gtOper == GT_IND)
3498 {
3499 srcAddr = source->gtGetOp1();
3500 assert(srcAddr->isUsedFromReg());
3501 }
3502 else
3503 {
3504 noway_assert(source->IsLocal());
3505 sourceIsLocal = true;
3506 }
3507
3508 bool dstOnStack = dstAddr->gtSkipReloadOrCopy()->OperIsLocalAddr();
3509
3510#ifdef DEBUG
3511
3512 assert(dstAddr->isUsedFromReg());
3513
3514 // If the GenTree node has data about GC pointers, this means we're dealing
3515 // with CpObj, so this requires special logic.
3516 assert(cpObjNode->gtGcPtrCount > 0);
3517
3518 // MovSp (alias for movsq on x64 and movsd on x86) instruction is used for copying non-gcref fields
3519 // and it needs src = RSI and dst = RDI.
3520 // Either these registers must not contain lclVars, or they must be dying or marked for spill.
3521 // This is because these registers are incremented as we go through the struct.
3522 if (!sourceIsLocal)
3523 {
3524 GenTree* actualSrcAddr = srcAddr->gtSkipReloadOrCopy();
3525 GenTree* actualDstAddr = dstAddr->gtSkipReloadOrCopy();
3526 unsigned srcLclVarNum = BAD_VAR_NUM;
3527 unsigned dstLclVarNum = BAD_VAR_NUM;
3528 bool isSrcAddrLiveOut = false;
3529 bool isDstAddrLiveOut = false;
3530 if (genIsRegCandidateLocal(actualSrcAddr))
3531 {
3532 srcLclVarNum = actualSrcAddr->AsLclVarCommon()->gtLclNum;
3533 isSrcAddrLiveOut = ((actualSrcAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
3534 }
3535 if (genIsRegCandidateLocal(actualDstAddr))
3536 {
3537 dstLclVarNum = actualDstAddr->AsLclVarCommon()->gtLclNum;
3538 isDstAddrLiveOut = ((actualDstAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
3539 }
3540 assert((actualSrcAddr->gtRegNum != REG_RSI) || !isSrcAddrLiveOut ||
3541 ((srcLclVarNum == dstLclVarNum) && !isDstAddrLiveOut));
3542 assert((actualDstAddr->gtRegNum != REG_RDI) || !isDstAddrLiveOut ||
3543 ((srcLclVarNum == dstLclVarNum) && !isSrcAddrLiveOut));
3544 srcAddrType = srcAddr->TypeGet();
3545 }
3546#endif // DEBUG
3547
3548 // Consume the operands and get them into the right registers.
3549 // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
3550 genConsumeBlockOp(cpObjNode, REG_RDI, REG_RSI, REG_NA);
3551 gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddrType);
3552 gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet());
3553
3554 unsigned slots = cpObjNode->gtSlots;
3555
3556 // If we can prove it's on the stack we don't need to use the write barrier.
3557 if (dstOnStack)
3558 {
3559 if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
3560 {
            // If the destination of the CpObj is on the stack, make sure we allocated
            // RCX to emit rep movsp (alias for rep movsd or rep movsq for 32 and 64 bits respectively).
3563 assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
3564
3565 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
3566 instGen(INS_r_movsp);
3567 }
3568 else
3569 {
3570 // For small structs, it's better to emit a sequence of movsp than to
3571 // emit a rep movsp instruction.
3572 while (slots > 0)
3573 {
3574 instGen(INS_movsp);
3575 slots--;
3576 }
3577 }
3578 }
3579 else
3580 {
3581 BYTE* gcPtrs = cpObjNode->gtGcPtrs;
3582 unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
3583
3584 unsigned i = 0;
3585 while (i < slots)
3586 {
3587 switch (gcPtrs[i])
3588 {
3589 case TYPE_GC_NONE:
3590 // Let's see if we can use rep movsp instead of a sequence of movsp instructions
3591 // to save cycles and code size.
3592 {
3593 unsigned nonGcSlotCount = 0;
3594
3595 do
3596 {
3597 nonGcSlotCount++;
3598 i++;
3599 } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
3600
3601 // If we have a very small contiguous non-gc region, it's better just to
3602 // emit a sequence of movsp instructions
3603 if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
3604 {
3605 while (nonGcSlotCount > 0)
3606 {
3607 instGen(INS_movsp);
3608 nonGcSlotCount--;
3609 }
3610 }
3611 else
3612 {
3613 // Otherwise, we can save code-size and improve CQ by emitting
3614 // rep movsp (alias for movsd/movsq for x86/x64)
3615 assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
3616
3617 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
3618 instGen(INS_r_movsp);
3619 }
3620 }
3621 break;
3622 default:
                    // We have a GC pointer; call the byref assignment write barrier helper.
3624 genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
3625 gcPtrCount--;
3626 i++;
3627 }
3628 }
3629
3630 assert(gcPtrCount == 0);
3631 }
3632
3633 // Clear the gcInfo for RSI and RDI.
3634 // While we normally update GC info prior to the last instruction that uses them,
3635 // these actually live into the helper call.
3636 gcInfo.gcMarkRegSetNpt(RBM_RSI);
3637 gcInfo.gcMarkRegSetNpt(RBM_RDI);
3638}
3639
// Generate code for a CpBlk node by means of the VM memcpy helper call.
// Preconditions:
// a) The size argument of the CpBlk is not an integer constant, or
// b) The size argument is a constant but is at least CPBLK_MOVS_LIMIT bytes.
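//
// Illustrative emitted sequence (register names assume the Windows x64 ABI, where
// REG_ARG_0/1/2 are RCX/RDX/R8):
//     mov rcx, <dstAddr>
//     mov rdx, <srcAddr>
//     mov r8,  <size>
//     call CORINFO_HELP_MEMCPY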
3644void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode)
3645{
3646#ifdef _TARGET_AMD64_
3647 // Make sure we got the arguments of the cpblk operation in the right registers
3648 unsigned blockSize = cpBlkNode->Size();
3649 GenTree* dstAddr = cpBlkNode->Addr();
3650 GenTree* source = cpBlkNode->Data();
3651 GenTree* srcAddr = nullptr;
3652
3653 // Size goes in arg2
3654 if (blockSize != 0)
3655 {
3656 assert(blockSize >= CPBLK_MOVS_LIMIT);
3657 assert((cpBlkNode->gtRsvdRegs & RBM_ARG_2) != 0);
3658 }
3659 else
3660 {
3661 noway_assert(cpBlkNode->gtOper == GT_STORE_DYN_BLK);
3662 }
3663
3664 // Source address goes in arg1
3665 if (source->gtOper == GT_IND)
3666 {
3667 srcAddr = source->gtGetOp1();
3668 assert(srcAddr->isUsedFromReg());
3669 }
3670 else
3671 {
3672 noway_assert(source->IsLocal());
3673 assert((cpBlkNode->gtRsvdRegs & RBM_ARG_1) != 0);
3674 inst_RV_TT(INS_lea, REG_ARG_1, source, 0, EA_BYREF);
3675 }
3676
3677 genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
3678
3679 genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
3680#else // !_TARGET_AMD64_
3681 noway_assert(false && "Helper call for CpBlk is not needed.");
3682#endif // !_TARGET_AMD64_
3683}
3684
// Generate code to do a switch statement based on a table of ip-relative offsets.
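// The emitted sequence looks roughly like this (illustrative):
//     mov  baseReg, dword ptr [baseReg + 4*idxReg]   ; load the ip-relative offset from the jump table
//     lea  tmpReg, [fgFirstBB]                       ; absolute address of the method's first block
//     add  baseReg, tmpReg
//     jmp  baseReg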
3686void CodeGen::genTableBasedSwitch(GenTree* treeNode)
3687{
3688 genConsumeOperands(treeNode->AsOp());
3689 regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
3690 regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
3691
3692 regNumber tmpReg = treeNode->GetSingleTempReg();
3693
3694 // load the ip-relative offset (which is relative to start of fgFirstBB)
3695 getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);
3696
3697 // add it to the absolute address of fgFirstBB
3698 compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
3699 getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg);
3700 getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg);
3701 // jmp baseReg
3702 getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg);
3703}
3704
// Emits the switch-case jump table and an instruction to get the address of its first element.
3706void CodeGen::genJumpTable(GenTree* treeNode)
3707{
3708 noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
3709 assert(treeNode->OperGet() == GT_JMPTABLE);
3710
3711 unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
3712 BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
3713 unsigned jmpTabOffs;
3714 unsigned jmpTabBase;
3715
3716 jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);
3717
3718 jmpTabOffs = 0;
3719
3720 JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);
3721
3722 for (unsigned i = 0; i < jumpCount; i++)
3723 {
3724 BasicBlock* target = *jumpTable++;
3725 noway_assert(target->bbFlags & BBF_JMP_TARGET);
3726
3727 JITDUMP(" DD L_M%03u_" FMT_BB "\n", Compiler::s_compMethodsCount, target->bbNum);
3728
3729 getEmitter()->emitDataGenData(i, target);
    }
3731
3732 getEmitter()->emitDataGenEnd();
3733
3734 // Access to inline data is 'abstracted' by a special type of static member
3735 // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
3736 // to constant data, not a real static field.
3737 getEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->gtRegNum,
3738 compiler->eeFindJitDataOffs(jmpTabBase), 0);
3739 genProduceReg(treeNode);
3740}
3741
3742//------------------------------------------------------------------------
3743// genCodeForLockAdd: Generate code for a GT_LOCKADD node
3744//
3745// Arguments:
3746// node - the GT_LOCKADD node
3747//
3748void CodeGen::genCodeForLockAdd(GenTreeOp* node)
3749{
3750 assert(node->OperIs(GT_LOCKADD));
3751
3752 GenTree* addr = node->gtGetOp1();
3753 GenTree* data = node->gtGetOp2();
3754 emitAttr size = emitActualTypeSize(data->TypeGet());
3755
3756 assert(addr->isUsedFromReg());
3757 assert(data->isUsedFromReg() || data->isContainedIntOrIImmed());
3758 assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
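    // Depending on whether 'data' is a contained immediate or a register, this emits (illustrative):
    //     lock add dword ptr [addrReg], imm32
    // or
    //     lock add dword ptr [addrReg], dataReg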
3759
3760 genConsumeOperands(node);
3761 instGen(INS_lock);
3762
3763 if (data->isContainedIntOrIImmed())
3764 {
3765 int imm = static_cast<int>(data->AsIntCon()->IconValue());
3766 assert(imm == data->AsIntCon()->IconValue());
3767 getEmitter()->emitIns_I_AR(INS_add, size, imm, addr->gtRegNum, 0);
3768 }
3769 else
3770 {
3771 getEmitter()->emitIns_AR_R(INS_add, size, data->gtRegNum, addr->gtRegNum, 0);
3772 }
3773}
3774
3775//------------------------------------------------------------------------
3776// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node.
3777//
3778// Arguments:
3779// node - the GT_XADD/XCHG node
3780//
3781void CodeGen::genLockedInstructions(GenTreeOp* node)
3782{
3783 assert(node->OperIs(GT_XADD, GT_XCHG));
3784
3785 GenTree* addr = node->gtGetOp1();
3786 GenTree* data = node->gtGetOp2();
3787 emitAttr size = emitTypeSize(node->TypeGet());
3788
3789 assert(addr->isUsedFromReg());
3790 assert(data->isUsedFromReg());
3791 assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
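    // Illustrative emitted forms (after moving 'data' into the destination register if needed):
    //     lock xadd [addrReg], targetReg   ; GT_XADD
    //     xchg      [addrReg], targetReg   ; GT_XCHG (the lock prefix is implicit)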
3792
3793 genConsumeOperands(node);
3794
3795 if (node->gtRegNum != data->gtRegNum)
3796 {
3797 // If the destination register is different from the data register then we need
3798 // to first move the data to the target register. Make sure we don't overwrite
3799 // the address, the register allocator should have taken care of this.
3800 assert(node->gtRegNum != addr->gtRegNum);
3801 getEmitter()->emitIns_R_R(INS_mov, size, node->gtRegNum, data->gtRegNum);
3802 }
3803
3804 instruction ins = node->OperIs(GT_XADD) ? INS_xadd : INS_xchg;
3805
3806 // XCHG has an implied lock prefix when the first operand is a memory operand.
3807 if (ins != INS_xchg)
3808 {
3809 instGen(INS_lock);
3810 }
3811
3812 getEmitter()->emitIns_AR_R(ins, size, node->gtRegNum, addr->gtRegNum, 0);
3813 genProduceReg(node);
3814}
3815
3816//------------------------------------------------------------------------
3817// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node.
3818//
3819// Arguments:
3820// tree - the GT_CMPXCHG node
3821//
3822void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree)
3823{
3824 assert(tree->OperIs(GT_CMPXCHG));
3825
3826 var_types targetType = tree->TypeGet();
3827 regNumber targetReg = tree->gtRegNum;
3828
3829 GenTree* location = tree->gtOpLocation; // arg1
3830 GenTree* value = tree->gtOpValue; // arg2
3831 GenTree* comparand = tree->gtOpComparand; // arg3
3832
3833 assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
3834 assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
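    // Overall shape of the emitted code (illustrative):
    //     mov  rax, comparandReg            ; if the comparand isn't already in RAX
    //     lock cmpxchg [locationReg], valueReg
    //     mov  targetReg, rax               ; if the result is wanted in a different register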
3835
3836 genConsumeReg(location);
3837 genConsumeReg(value);
3838 genConsumeReg(comparand);
3839
3840 // comparand goes to RAX;
    // Note that we must issue this move after the genConsumeReg() calls above, in case any of them
    // have a GT_COPY from RAX.
3843 if (comparand->gtRegNum != REG_RAX)
3844 {
3845 inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
3846 }
3847
3848 // location is Rm
3849 instGen(INS_lock);
3850
3851 getEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
3852
3853 // Result is in RAX
3854 if (targetReg != REG_RAX)
3855 {
3856 inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
3857 }
3858
3859 genProduceReg(tree);
3860}
3861
3862// generate code for BoundsCheck nodes
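// For example, when the index is in a register and the array length is a memory operand, this
// emits (illustrative):
//     cmp idxReg, dword ptr [arrLen]
//     jae <range check failure block>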
3863void CodeGen::genRangeCheck(GenTree* oper)
3864{
3865 noway_assert(oper->OperIsBoundsCheck());
3866 GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();
3867
3868 GenTree* arrIndex = bndsChk->gtIndex;
3869 GenTree* arrLen = bndsChk->gtArrLen;
3870 GenTree* arrRef = nullptr;
3871 int lenOffset = 0;
3872
3873 GenTree * src1, *src2;
3874 emitJumpKind jmpKind;
3875
3876 genConsumeRegs(arrIndex);
3877 genConsumeRegs(arrLen);
3878
3879 if (arrIndex->isContainedIntOrIImmed())
3880 {
3881 // arrIndex is a contained constant. In this case
3882 // we will generate one of the following
3883 // cmp [mem], immed (if arrLen is a memory op)
3884 // cmp reg, immed (if arrLen is in a reg)
3885 //
        // That is, arrLen cannot be a contained immediate.
3887 assert(!arrLen->isContainedIntOrIImmed());
3888
3889 src1 = arrLen;
3890 src2 = arrIndex;
3891 jmpKind = EJ_jbe;
3892 }
3893 else
3894 {
3895 // arrIndex could either be a contained memory op or a reg
3896 // In this case we will generate one of the following
3897 // cmp [mem], immed (if arrLen is a constant)
3898 // cmp [mem], reg (if arrLen is in a reg)
3899 // cmp reg, immed (if arrIndex is in a reg)
        //       cmp reg1, reg2      (if arrIndex is in reg1)
        //       cmp reg, [mem]      (if arrLen is a memory op)
        //
        // That is, only one of arrIndex or arrLen can be a memory op.
3904 assert(!arrIndex->isUsedFromMemory() || !arrLen->isUsedFromMemory());
3905
3906 src1 = arrIndex;
3907 src2 = arrLen;
3908 jmpKind = EJ_jae;
3909 }
3910
3911 var_types bndsChkType = src2->TypeGet();
3912#if DEBUG
3913 // Bounds checks can only be 32 or 64 bit sized comparisons.
3914 assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG);
3915
    // The type of the bounds check should always be wide enough to compare against the index.
3917 assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet()));
3918#endif // DEBUG
3919
3920 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(bndsChkType), src1, src2);
3921 genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB);
3922}
3923
3924//---------------------------------------------------------------------
3925// genCodeForPhysReg - generate code for a GT_PHYSREG node
3926//
3927// Arguments
3928// tree - the GT_PHYSREG node
3929//
3930// Return value:
3931// None
3932//
3933void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree)
3934{
3935 assert(tree->OperIs(GT_PHYSREG));
3936
3937 var_types targetType = tree->TypeGet();
3938 regNumber targetReg = tree->gtRegNum;
3939
3940 if (targetReg != tree->gtSrcReg)
3941 {
3942 inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType);
3943 genTransferRegGCState(targetReg, tree->gtSrcReg);
3944 }
3945
3946 genProduceReg(tree);
3947}
3948
3949//---------------------------------------------------------------------
3950// genCodeForNullCheck - generate code for a GT_NULLCHECK node
3951//
3952// Arguments
3953// tree - the GT_NULLCHECK node
3954//
3955// Return value:
3956// None
3957//
3958void CodeGen::genCodeForNullCheck(GenTreeOp* tree)
3959{
3960 assert(tree->OperIs(GT_NULLCHECK));
3961
3962 assert(tree->gtOp1->isUsedFromReg());
3963 regNumber reg = genConsumeReg(tree->gtOp1);
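    // Comparing the register against itself through memory forces a read of [reg] and faults if
    // 'reg' is null, e.g. (illustrative): cmp dword ptr [rax], eax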
3964 getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
3965}
3966
3967//------------------------------------------------------------------------
3968// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
3969// lower bound for the given dimension.
3970//
3971// Arguments:
3972// elemType - the element type of the array
3973// rank - the rank of the array
3974// dimension - the dimension for which the lower bound offset will be returned.
3975//
3976// Return Value:
3977// The offset.
3978
3979unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension)
3980{
3981 // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
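    // For example (illustrative), for a rank-2 array the per-dimension lengths are laid out first,
    // so the lower bound of dimension 0 lives at eeGetArrayDataOffset(elemType) + 4 * (0 + 2).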
3982 return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank);
3983}
3984
3985//------------------------------------------------------------------------
// genOffsetOfMDArrayDimensionSize: Returns the offset from the Array object to the
//    length (dimension size) for the given dimension.
//
// Arguments:
//    elemType  - the element type of the array
//    rank      - the rank of the array
//    dimension - the dimension for which the length offset will be returned.
3993//
3994// Return Value:
3995// The offset.
3996
3997unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension)
3998{
3999 // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
4000 return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension;
4001}
4002
4003//------------------------------------------------------------------------
4004// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference,
4005// producing the effective index by subtracting the lower bound.
4006//
4007// Arguments:
4008// arrIndex - the node for which we're generating code
4009//
4010// Return Value:
4011// None.
4012//
4013
4014void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex)
4015{
4016 GenTree* arrObj = arrIndex->ArrObj();
4017 GenTree* indexNode = arrIndex->IndexExpr();
4018
4019 regNumber arrReg = genConsumeReg(arrObj);
4020 regNumber indexReg = genConsumeReg(indexNode);
4021 regNumber tgtReg = arrIndex->gtRegNum;
4022
4023 unsigned dim = arrIndex->gtCurrDim;
4024 unsigned rank = arrIndex->gtArrRank;
4025 var_types elemType = arrIndex->gtArrElemType;
4026
4027 noway_assert(tgtReg != REG_NA);
4028
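    // Overall shape of the emitted code (illustrative):
    //     mov tgtReg, indexReg                               ; if not already there
    //     sub tgtReg, dword ptr [arrReg + lowerBoundOffset]
    //     cmp tgtReg, dword ptr [arrReg + lengthOffset]
    //     jae <range check failure block>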
4029 // Subtract the lower bound for this dimension.
4030 // TODO-XArch-CQ: make this contained if it's an immediate that fits.
4031 if (tgtReg != indexReg)
4032 {
4033 inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet());
4034 }
4035 getEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
4036 genOffsetOfMDArrayLowerBound(elemType, rank, dim));
4037 getEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
4038 genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
4039 genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL);
4040
4041 genProduceReg(arrIndex);
4042}
4043
4044//------------------------------------------------------------------------
4045// genCodeForArrOffset: Generates code to compute the flattened array offset for
4046// one dimension of an array reference:
4047// result = (prevDimOffset * dimSize) + effectiveIndex
4048// where dimSize is obtained from the arrObj operand
4049//
4050// Arguments:
4051// arrOffset - the node for which we're generating code
4052//
4053// Return Value:
4054// None.
4055//
4056// Notes:
4057// dimSize and effectiveIndex are always non-negative, the former by design,
4058// and the latter because it has been normalized to be zero-based.
4059
4060void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset)
4061{
4062 GenTree* offsetNode = arrOffset->gtOffset;
4063 GenTree* indexNode = arrOffset->gtIndex;
4064 GenTree* arrObj = arrOffset->gtArrObj;
4065
4066 regNumber tgtReg = arrOffset->gtRegNum;
4067 assert(tgtReg != REG_NA);
4068
4069 unsigned dim = arrOffset->gtCurrDim;
4070 unsigned rank = arrOffset->gtArrRank;
4071 var_types elemType = arrOffset->gtArrElemType;
4072
4073 // First, consume the operands in the correct order.
4074 regNumber offsetReg = REG_NA;
4075 regNumber tmpReg = REG_NA;
4076 if (!offsetNode->IsIntegralConst(0))
4077 {
4078 offsetReg = genConsumeReg(offsetNode);
4079
4080 // We will use a temp register for the offset*scale+effectiveIndex computation.
4081 tmpReg = arrOffset->GetSingleTempReg();
4082 }
4083 else
4084 {
4085 assert(offsetNode->isContained());
4086 }
4087 regNumber indexReg = genConsumeReg(indexNode);
4088 // Although arrReg may not be used in the constant-index case, if we have generated
4089 // the value into a register, we must consume it, otherwise we will fail to end the
4090 // live range of the gc ptr.
4091 // TODO-CQ: Currently arrObj will always have a register allocated to it.
4092 // We could avoid allocating a register for it, which would be of value if the arrObj
4093 // is an on-stack lclVar.
4094 regNumber arrReg = REG_NA;
4095 if (arrObj->gtHasReg())
4096 {
4097 arrReg = genConsumeReg(arrObj);
4098 }
4099
4100 if (!offsetNode->IsIntegralConst(0))
4101 {
4102 assert(tmpReg != REG_NA);
4103 assert(arrReg != REG_NA);
4104
4105 // Evaluate tgtReg = offsetReg*dim_size + indexReg.
4106 // tmpReg is used to load dim_size and the result of the multiplication.
4107 // Note that dim_size will never be negative.
4108
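        // Illustrative emitted sequence:
        //     mov  tmpReg, dword ptr [arrReg + dimSizeOffset]
        //     imul tmpReg, offsetReg
        //     mov  tgtReg, indexReg        ; if needed
        //     add  tgtReg, tmpReg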
4109 getEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg,
4110 genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
4111 inst_RV_RV(INS_imul, tmpReg, offsetReg);
4112
4113 if (tmpReg == tgtReg)
4114 {
4115 inst_RV_RV(INS_add, tmpReg, indexReg);
4116 }
4117 else
4118 {
4119 if (indexReg != tgtReg)
4120 {
4121 inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL);
4122 }
4123 inst_RV_RV(INS_add, tgtReg, tmpReg);
4124 }
4125 }
4126 else
4127 {
4128 if (indexReg != tgtReg)
4129 {
4130 inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT);
4131 }
4132 }
4133 genProduceReg(arrOffset);
4134}
4135
4136instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
4137{
4138 instruction ins;
4139
    // Operations on SIMD vectors shouldn't come down this path.
4141 assert(!varTypeIsSIMD(type));
4142 if (varTypeIsFloating(type))
4143 {
4144 return ins_MathOp(oper, type);
4145 }
4146
4147 switch (oper)
4148 {
4149 case GT_ADD:
4150 ins = INS_add;
4151 break;
4152 case GT_AND:
4153 ins = INS_and;
4154 break;
4155 case GT_LSH:
4156 ins = INS_shl;
4157 break;
4158 case GT_MUL:
4159 ins = INS_imul;
4160 break;
4161 case GT_NEG:
4162 ins = INS_neg;
4163 break;
4164 case GT_NOT:
4165 ins = INS_not;
4166 break;
4167 case GT_OR:
4168 ins = INS_or;
4169 break;
4170 case GT_ROL:
4171 ins = INS_rol;
4172 break;
4173 case GT_ROR:
4174 ins = INS_ror;
4175 break;
4176 case GT_RSH:
4177 ins = INS_sar;
4178 break;
4179 case GT_RSZ:
4180 ins = INS_shr;
4181 break;
4182 case GT_SUB:
4183 ins = INS_sub;
4184 break;
4185 case GT_XOR:
4186 ins = INS_xor;
4187 break;
4188#if !defined(_TARGET_64BIT_)
4189 case GT_ADD_LO:
4190 ins = INS_add;
4191 break;
4192 case GT_ADD_HI:
4193 ins = INS_adc;
4194 break;
4195 case GT_SUB_LO:
4196 ins = INS_sub;
4197 break;
4198 case GT_SUB_HI:
4199 ins = INS_sbb;
4200 break;
4201 case GT_LSH_HI:
4202 ins = INS_shld;
4203 break;
4204 case GT_RSH_LO:
4205 ins = INS_shrd;
4206 break;
4207#endif // !defined(_TARGET_64BIT_)
4208 default:
4209 unreached();
4210 break;
4211 }
4212 return ins;
4213}
4214
4215//------------------------------------------------------------------------
4216// genCodeForShift: Generates the code sequence for a GenTree node that
4217// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror).
4218//
4219// Arguments:
4220// tree - the bit shift node (that specifies the type of bit shift to perform).
4221//
4222// Assumptions:
4223// a) All GenTrees are register allocated.
4224// b) The shift-by-amount in tree->gtOp.gtOp2 is either a contained constant or
4225// it's a register-allocated expression. If it is in a register that is
4226// not RCX, it will be moved to RCX (so RCX better not be in use!).
4227//
4228void CodeGen::genCodeForShift(GenTree* tree)
4229{
4230 // Only the non-RMW case here.
4231 assert(tree->OperIsShiftOrRotate());
4232 assert(tree->gtOp.gtOp1->isUsedFromReg());
4233 assert(tree->gtRegNum != REG_NA);
4234
4235 genConsumeOperands(tree->AsOp());
4236
4237 var_types targetType = tree->TypeGet();
4238 instruction ins = genGetInsForOper(tree->OperGet(), targetType);
4239
4240 GenTree* operand = tree->gtGetOp1();
4241 regNumber operandReg = operand->gtRegNum;
4242
4243 GenTree* shiftBy = tree->gtGetOp2();
4244
4245 if (shiftBy->isContainedIntOrIImmed())
4246 {
4247 // First, move the operand to the destination register and
4248 // later on perform the shift in-place.
4249 // (LSRA will try to avoid this situation through preferencing.)
4250 if (tree->gtRegNum != operandReg)
4251 {
4252 inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
4253 }
4254
4255 int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
4256 inst_RV_SH(ins, emitTypeSize(tree), tree->gtRegNum, shiftByValue);
4257 }
4258 else
4259 {
4260 // We must have the number of bits to shift stored in ECX, since we constrained this node to
4261 // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
4262 // register destination requirement.
4263 genCopyRegIfNeeded(shiftBy, REG_RCX);
4264
4265 // The operand to be shifted must not be in ECX
4266 noway_assert(operandReg != REG_RCX);
4267
4268 if (tree->gtRegNum != operandReg)
4269 {
4270 inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
4271 }
4272 inst_RV_CL(ins, tree->gtRegNum, targetType);
4273 }
4274
4275 genProduceReg(tree);
4276}
4277
4278#ifdef _TARGET_X86_
4279//------------------------------------------------------------------------
4280// genCodeForShiftLong: Generates the code sequence for a GenTree node that
4281// represents a three operand bit shift or rotate operation (<<Hi, >>Lo).
4282//
4283// Arguments:
4284// tree - the bit shift node (that specifies the type of bit shift to perform).
4285//
4286// Assumptions:
4287// a) All GenTrees are register allocated.
4288// b) The shift-by-amount in tree->gtOp.gtOp2 is a contained constant
4289//
4290// TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't
4291// need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to
4292// targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as
4293// contained memory-op. Even if not a memory-op, we could mark it as reg-optional.
4294//
4295void CodeGen::genCodeForShiftLong(GenTree* tree)
4296{
4297 // Only the non-RMW case here.
4298 genTreeOps oper = tree->OperGet();
4299 assert(oper == GT_LSH_HI || oper == GT_RSH_LO);
4300
4301 GenTree* operand = tree->gtOp.gtOp1;
4302 assert(operand->OperGet() == GT_LONG);
4303 assert(operand->gtOp.gtOp1->isUsedFromReg());
4304 assert(operand->gtOp.gtOp2->isUsedFromReg());
4305
4306 GenTree* operandLo = operand->gtGetOp1();
4307 GenTree* operandHi = operand->gtGetOp2();
4308
4309 regNumber regLo = operandLo->gtRegNum;
4310 regNumber regHi = operandHi->gtRegNum;
4311
4312 genConsumeOperands(tree->AsOp());
4313
4314 var_types targetType = tree->TypeGet();
4315 instruction ins = genGetInsForOper(oper, targetType);
4316
4317 GenTree* shiftBy = tree->gtGetOp2();
4318
4319 assert(shiftBy->isContainedIntOrIImmed());
4320
4321 unsigned int count = shiftBy->AsIntConCommon()->IconValue();
4322
4323 regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo;
4324
4325 if (regResult != tree->gtRegNum)
4326 {
4327 inst_RV_RV(INS_mov, tree->gtRegNum, regResult, targetType);
4328 }
4329
4330 if (oper == GT_LSH_HI)
4331 {
4332 inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regLo, count);
4333 }
4334 else
4335 {
4336 assert(oper == GT_RSH_LO);
4337 inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regHi, count);
4338 }
4339
4340 genProduceReg(tree);
4341}
4342#endif
4343
4344//------------------------------------------------------------------------
4345// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that
4346// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example:
4347// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) )
4348//
4349// Arguments:
4350// storeIndNode: the GT_STOREIND node.
4351//
4352void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd)
4353{
4354 GenTree* data = storeInd->Data();
4355 GenTree* addr = storeInd->Addr();
4356
4357 assert(data->OperIsShift() || data->OperIsRotate());
4358
4359 // This function only handles the RMW case.
4360 assert(data->gtOp.gtOp1->isUsedFromMemory());
4361 assert(data->gtOp.gtOp1->isIndir());
4362 assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd));
4363 assert(data->gtRegNum == REG_NA);
4364
4365 var_types targetType = data->TypeGet();
4366 genTreeOps oper = data->OperGet();
4367 instruction ins = genGetInsForOper(oper, targetType);
4368 emitAttr attr = EA_ATTR(genTypeSize(targetType));
4369
4370 GenTree* shiftBy = data->gtOp.gtOp2;
4371 if (shiftBy->isContainedIntOrIImmed())
4372 {
4373 int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
4374 ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
4375 if (shiftByValue == 1)
4376 {
4377 // There is no source in this case, as the shift by count is embedded in the instruction opcode itself.
4378 getEmitter()->emitInsRMW(ins, attr, storeInd);
4379 }
4380 else
4381 {
4382 getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy);
4383 }
4384 }
4385 else
4386 {
4387 // We must have the number of bits to shift stored in ECX, since we constrained this node to
4388 // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
4389 // register destination requirement.
        genCopyRegIfNeeded(shiftBy, REG_RCX);
4392
4393 // The shiftBy operand is implicit, so call the unary version of emitInsRMW.
4394 getEmitter()->emitInsRMW(ins, attr, storeInd);
4395 }
4396}
4397
4398//------------------------------------------------------------------------
4399// genCodeForLclAddr: Generates the code for GT_LCL_FLD_ADDR/GT_LCL_VAR_ADDR.
4400//
4401// Arguments:
4402// tree - the node.
4403//
4404void CodeGen::genCodeForLclAddr(GenTree* tree)
4405{
4406 assert(tree->OperIs(GT_LCL_FLD_ADDR, GT_LCL_VAR_ADDR));
4407
4408 var_types targetType = tree->TypeGet();
4409 regNumber targetReg = tree->gtRegNum;
4410
4411 // Address of a local var.
4412 noway_assert(targetType == TYP_BYREF);
4413
4414 inst_RV_TT(INS_lea, targetReg, tree, 0, EA_BYREF);
4415 genProduceReg(tree);
4416}
4417
4418//------------------------------------------------------------------------
4419// genCodeForLclFld: Produce code for a GT_LCL_FLD node.
4420//
4421// Arguments:
4422// tree - the GT_LCL_FLD node
4423//
4424void CodeGen::genCodeForLclFld(GenTreeLclFld* tree)
4425{
4426 assert(tree->OperIs(GT_LCL_FLD));
4427
4428 var_types targetType = tree->TypeGet();
4429 regNumber targetReg = tree->gtRegNum;
4430
4431 noway_assert(targetReg != REG_NA);
4432
4433#ifdef FEATURE_SIMD
4434 // Loading of TYP_SIMD12 (i.e. Vector3) field
4435 if (targetType == TYP_SIMD12)
4436 {
4437 genLoadLclTypeSIMD12(tree);
4438 return;
4439 }
4440#endif
4441
4442 noway_assert(targetType != TYP_STRUCT);
4443
4444 emitAttr size = emitTypeSize(targetType);
4445 unsigned offs = tree->gtLclOffs;
4446 unsigned varNum = tree->gtLclNum;
4447 assert(varNum < compiler->lvaCount);
4448
4449 getEmitter()->emitIns_R_S(ins_Load(targetType), size, targetReg, varNum, offs);
4450
4451 genProduceReg(tree);
4452}
4453
4454//------------------------------------------------------------------------
4455// genCodeForLclVar: Produce code for a GT_LCL_VAR node.
4456//
4457// Arguments:
4458// tree - the GT_LCL_VAR node
4459//
4460void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
4461{
4462 assert(tree->OperIs(GT_LCL_VAR));
4463
4464 // lcl_vars are not defs
4465 assert((tree->gtFlags & GTF_VAR_DEF) == 0);
4466
4467 bool isRegCandidate = compiler->lvaTable[tree->gtLclNum].lvIsRegCandidate();
4468
4469 // If this is a register candidate that has been spilled, genConsumeReg() will
4470 // reload it at the point of use. Otherwise, if it's not in a register, we load it here.
4471
4472 if (!isRegCandidate && !(tree->gtFlags & GTF_SPILLED))
4473 {
4474#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
4475 // Loading of TYP_SIMD12 (i.e. Vector3) variable
4476 if (tree->TypeGet() == TYP_SIMD12)
4477 {
4478 genLoadLclTypeSIMD12(tree);
4479 return;
4480 }
4481#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
4482
4483 getEmitter()->emitIns_R_S(ins_Load(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(tree->gtLclNum)),
4484 emitTypeSize(tree), tree->gtRegNum, tree->gtLclNum, 0);
4485 genProduceReg(tree);
4486 }
4487}
4488
4489//------------------------------------------------------------------------
4490// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node.
4491//
4492// Arguments:
4493// tree - the GT_STORE_LCL_FLD node
4494//
4495void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
4496{
4497 assert(tree->OperIs(GT_STORE_LCL_FLD));
4498
4499 var_types targetType = tree->TypeGet();
4500 noway_assert(targetType != TYP_STRUCT);
4501 assert(!varTypeIsFloating(targetType) || (targetType == tree->gtOp1->TypeGet()));
4502
4503#ifdef FEATURE_SIMD
4504 // storing of TYP_SIMD12 (i.e. Vector3) field
4505 if (tree->TypeGet() == TYP_SIMD12)
4506 {
4507 genStoreLclTypeSIMD12(tree);
4508 return;
4509 }
4510#endif // FEATURE_SIMD
4511
4512 GenTree* op1 = tree->gtGetOp1();
4513 genConsumeRegs(op1);
4514 getEmitter()->emitInsBinary(ins_Store(targetType), emitTypeSize(tree), tree, op1);
4515
4516 genUpdateLife(tree);
4517}
4518
4519//------------------------------------------------------------------------
4520// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node.
4521//
4522// Arguments:
4523// tree - the GT_STORE_LCL_VAR node
4524//
4525void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* tree)
4526{
4527 assert(tree->OperIs(GT_STORE_LCL_VAR));
4528
4529 var_types targetType = tree->TypeGet();
4530 regNumber targetReg = tree->gtRegNum;
4531 emitter* emit = getEmitter();
4532
4533 GenTree* op1 = tree->gtGetOp1();
4534
4535 // var = call, where call returns a multi-reg return value
4536 // case is handled separately.
4537 if (op1->gtSkipReloadOrCopy()->IsMultiRegCall())
4538 {
4539 genMultiRegCallStoreToLocal(tree);
4540 }
4541 else
4542 {
4543 noway_assert(targetType != TYP_STRUCT);
4544 assert(!varTypeIsFloating(targetType) || (targetType == op1->TypeGet()));
4545
4546 unsigned lclNum = tree->gtLclNum;
4547 LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
4548
4549 // Ensure that lclVar nodes are typed correctly.
4550 assert(!varDsc->lvNormalizeOnStore() || (targetType == genActualType(varDsc->TypeGet())));
4551
4552#if !defined(_TARGET_64BIT_)
4553 if (targetType == TYP_LONG)
4554 {
4555 genStoreLongLclVar(tree);
4556 return;
4557 }
4558#endif // !defined(_TARGET_64BIT_)
4559
4560#ifdef FEATURE_SIMD
4561 // storing of TYP_SIMD12 (i.e. Vector3) field
4562 if (targetType == TYP_SIMD12)
4563 {
4564 genStoreLclTypeSIMD12(tree);
4565 return;
4566 }
4567
4568 if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
4569 {
4570 // This is only possible for a zero-init.
4571 noway_assert(op1->IsIntegralConst(0));
4572 genSIMDZero(targetType, varDsc->lvBaseType, targetReg);
4573 genProduceReg(tree);
4574 return;
4575 }
4576#endif // FEATURE_SIMD
4577
4578 genConsumeRegs(op1);
4579
4580 if (targetReg == REG_NA)
4581 {
4582 // stack store
4583 emit->emitInsStoreLcl(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)),
4584 emitTypeSize(targetType), tree);
4585 varDsc->lvRegNum = REG_STK;
4586 }
4587 else
4588 {
4589 // Look for the case where we have a constant zero which we've marked for reuse,
4590 // but which isn't actually in the register we want. In that case, it's better to create
4591 // zero in the target register, because an xor is smaller than a copy. Note that we could
4592 // potentially handle this in the register allocator, but we can't always catch it there
4593 // because the target may not have a register allocated for it yet.
4594 if (op1->isUsedFromReg() && (op1->gtRegNum != targetReg) && (op1->IsIntegralConst(0) || op1->IsFPZero()))
4595 {
4596 op1->gtRegNum = REG_NA;
4597 op1->ResetReuseRegVal();
4598 op1->SetContained();
4599 }
4600
4601 if (!op1->isUsedFromReg())
4602 {
                // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register
                // must be a constant. However, in the future we might want to support an operand used from
                // memory. This is a bit tricky because we have to decide that it can be used from memory
                // before register allocation, and this would be a case where, once that's done, we need to
                // mark that node as always requiring a register - which we always assume now anyway, but
                // once we "optimize" that we'll have to take cases like this into account.
4610 assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
4611 genSetRegToConst(targetReg, targetType, op1);
4612 }
4613 else if (op1->gtRegNum != targetReg)
4614 {
4615 assert(op1->gtRegNum != REG_NA);
4616 emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(tree), tree, op1);
4617 }
4618 }
4619 }
4620
4621 if (targetReg != REG_NA)
4622 {
4623 genProduceReg(tree);
4624 }
4625}
4626
4627//------------------------------------------------------------------------
4628// genCodeForIndexAddr: Produce code for a GT_INDEX_ADDR node.
4629//
4630// Arguments:
//    node - the GT_INDEX_ADDR node
4632//
4633void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
4634{
4635 GenTree* const base = node->Arr();
4636 GenTree* const index = node->Index();
4637
4638 genConsumeReg(base);
4639 genConsumeReg(index);
4640
4641 // NOTE: `genConsumeReg` marks the consumed register as not a GC pointer, as it assumes that the input registers
4642 // die at the first instruction generated by the node. This is not the case for `INDEX_ADDR`, however, as the
4643 // base register is multiply-used. As such, we need to mark the base register as containing a GC pointer until
4644 // we are finished generating the code for this node.
4645
4646 gcInfo.gcMarkRegPtrVal(base->gtRegNum, base->TypeGet());
4647 assert(!varTypeIsGC(index->TypeGet()));
4648
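    // Overall shape of the emitted code for a range-checked element of size 8 (illustrative):
    //     cmp idxReg, dword ptr [baseReg + lenOffset]
    //     jae <range check failure block>
    //     lea dstReg, [baseReg + 8*idxReg + elemOffset]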
4649 regNumber tmpReg = REG_NA;
4650
4651 // Generate the bounds check if necessary.
4652 if ((node->gtFlags & GTF_INX_RNGCHK) != 0)
4653 {
4654 // Create a GT_IND(GT_LEA)) tree for the array length access.
4655 GenTreeAddrMode arrLenAddr(base->TypeGet(), base, nullptr, 0, node->gtLenOffset);
4656 arrLenAddr.gtRegNum = REG_NA;
4657 arrLenAddr.SetContained();
4658
4659 GenTreeIndir arrLen = indirForm(TYP_INT, &arrLenAddr);
4660
4661#ifdef _TARGET_64BIT_
4662 // The CLI Spec allows an array to be indexed by either an int32 or a native int. In the case that the index
4663 // is a native int on a 64-bit platform, we will need to widen the array length and the compare.
4664 if (index->TypeGet() == TYP_I_IMPL)
4665 {
4666 // Load the array length into a register.
4667 tmpReg = node->GetSingleTempReg();
4668 arrLen.gtRegNum = tmpReg;
4669 arrLen.ClearContained();
4670 getEmitter()->emitInsLoadInd(ins_Load(TYP_INT), EA_4BYTE, arrLen.gtRegNum, &arrLen);
4671 }
4672 else
4673#endif
4674 {
4675 assert(varTypeIsIntegral(index->TypeGet()));
4676
4677 arrLen.gtRegNum = REG_NA;
4678 arrLen.SetContained();
4679 }
4680
4681 // Generate the range check.
4682 getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_I_IMPL), index, &arrLen);
4683 genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL, node->gtIndRngFailBB);
4684 }
4685
4686 // Compute the address of the array element.
4687 switch (node->gtElemSize)
4688 {
4689 case 1:
4690 case 2:
4691 case 4:
4692 case 8:
4693 getEmitter()->emitIns_R_ARX(INS_lea, emitTypeSize(node), node->gtRegNum, base->gtRegNum, index->gtRegNum,
4694 node->gtElemSize, static_cast<int>(node->gtElemOffset));
4695 break;
4696
4697 default:
4698 {
4699 // Multiply the index by the element size.
4700 //
4701 // TODO-CQ: this should really just use `imul index, index, #gtElemSize`
4702 tmpReg = (tmpReg == REG_NA) ? node->GetSingleTempReg() : tmpReg;
4703 CodeGen::genSetRegToIcon(tmpReg, (ssize_t)node->gtElemSize, TYP_INT);
4704 inst_RV_RV(INS_imul, tmpReg, index->gtRegNum);
4705 getEmitter()->emitIns_R_ARX(INS_lea, emitTypeSize(node), node->gtRegNum, base->gtRegNum, tmpReg, 1,
4706 static_cast<int>(node->gtElemOffset));
4707 break;
4708 }
4709 }
4710
4711 gcInfo.gcMarkRegSetNpt(base->gtGetRegMask());
4712
4713 genProduceReg(node);
4714}
4715
4716//------------------------------------------------------------------------
4717// genCodeForIndir: Produce code for a GT_IND node.
4718//
4719// Arguments:
4720// tree - the GT_IND node
4721//
4722void CodeGen::genCodeForIndir(GenTreeIndir* tree)
4723{
4724 assert(tree->OperIs(GT_IND));
4725
4726#ifdef FEATURE_SIMD
4727 // Handling of Vector3 type values loaded through indirection.
4728 if (tree->TypeGet() == TYP_SIMD12)
4729 {
4730 genLoadIndTypeSIMD12(tree);
4731 return;
4732 }
4733#endif // FEATURE_SIMD
4734
4735 var_types targetType = tree->TypeGet();
4736 emitter* emit = getEmitter();
4737
4738 GenTree* addr = tree->Addr();
4739 if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL))
4740 {
4741 noway_assert(EA_ATTR(genTypeSize(targetType)) == EA_PTRSIZE);
4742 emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, tree->gtRegNum, FLD_GLOBAL_FS,
4743 (int)addr->gtIntCon.gtIconVal);
4744 }
4745 else
4746 {
4747 genConsumeAddress(addr);
4748 emit->emitInsLoadInd(ins_Load(targetType), emitTypeSize(tree), tree->gtRegNum, tree);
4749 }
4750
4751 genProduceReg(tree);
4752}
4753
4754//------------------------------------------------------------------------
4755// genRegCopy: Produce code for a GT_COPY node.
4756//
4757// Arguments:
4758// tree - the GT_COPY node
4759//
4760// Notes:
//    This will copy the register(s) produced by this node's source, to
//    the register(s) allocated to this GT_COPY node.
//    It has some special handling for these cases:
4764// - when the source and target registers are in different register files
4765// (note that this is *not* a conversion).
4766// - when the source is a lclVar whose home location is being moved to a new
4767// register (rather than just being copied for temporary use).
4768//
4769void CodeGen::genRegCopy(GenTree* treeNode)
4770{
4771 assert(treeNode->OperGet() == GT_COPY);
4772 GenTree* op1 = treeNode->gtOp.gtOp1;
4773
4774 if (op1->IsMultiRegNode())
4775 {
4776 genConsumeReg(op1);
4777
4778 GenTreeCopyOrReload* copyTree = treeNode->AsCopyOrReload();
4779 unsigned regCount = treeNode->GetMultiRegCount();
4780
4781 for (unsigned i = 0; i < regCount; ++i)
4782 {
4783 var_types type = op1->GetRegTypeByIndex(i);
4784 regNumber fromReg = op1->GetRegByIndex(i);
4785 regNumber toReg = copyTree->GetRegNumByIdx(i);
4786
4787 // A Multi-reg GT_COPY node will have a valid reg only for those positions for which a corresponding
4788 // result reg of the multi-reg node needs to be copied.
4789 if (toReg != REG_NA)
4790 {
4791 assert(toReg != fromReg);
4792 inst_RV_RV(ins_Copy(type), toReg, fromReg, type);
4793 }
4794 }
4795 }
4796 else
4797 {
4798 var_types targetType = treeNode->TypeGet();
4799 regNumber targetReg = treeNode->gtRegNum;
4800 assert(targetReg != REG_NA);
4801
4802 // Check whether this node and the node from which we're copying the value have
4803 // different register types. This can happen if (currently iff) we have a SIMD
4804 // vector type that fits in an integer register, in which case it is passed as
4805 // an argument, or returned from a call, in an integer register and must be
4806 // copied if it's in an xmm register.
4807
4808 bool srcFltReg = (varTypeIsFloating(op1) || varTypeIsSIMD(op1));
4809 bool tgtFltReg = (varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode));
4810 if (srcFltReg != tgtFltReg)
4811 {
4812 instruction ins;
4813 regNumber fpReg;
4814 regNumber intReg;
4815 if (tgtFltReg)
4816 {
4817 ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
4818 fpReg = targetReg;
4819 intReg = op1->gtRegNum;
4820 }
4821 else
4822 {
4823 ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
4824 intReg = targetReg;
4825 fpReg = op1->gtRegNum;
4826 }
4827 inst_RV_RV(ins, fpReg, intReg, targetType);
4828 }
4829 else
4830 {
4831 inst_RV_RV(ins_Copy(targetType), targetReg, genConsumeReg(op1), targetType);
4832 }
4833
4834 if (op1->IsLocal())
4835 {
4836 // The lclVar will never be a def.
4837 // If it is a last use, the lclVar will be killed by genConsumeReg(), as usual, and genProduceReg will
4838 // appropriately set the gcInfo for the copied value.
4839 // If not, there are two cases we need to handle:
4840 // - If this is a TEMPORARY copy (indicated by the GTF_VAR_DEATH flag) the variable
4841 // will remain live in its original register.
4842 // genProduceReg() will appropriately set the gcInfo for the copied value,
4843 // and genConsumeReg will reset it.
4844 // - Otherwise, we need to update register info for the lclVar.
4845
4846 GenTreeLclVarCommon* lcl = op1->AsLclVarCommon();
4847 assert((lcl->gtFlags & GTF_VAR_DEF) == 0);
4848
4849 if ((lcl->gtFlags & GTF_VAR_DEATH) == 0 && (treeNode->gtFlags & GTF_VAR_DEATH) == 0)
4850 {
4851 LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
4852
4853 // If we didn't just spill it (in genConsumeReg, above), then update the register info
4854 if (varDsc->lvRegNum != REG_STK)
4855 {
4856 // The old location is dying
4857 genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(op1));
4858
4859 gcInfo.gcMarkRegSetNpt(genRegMask(op1->gtRegNum));
4860
4861 genUpdateVarReg(varDsc, treeNode);
4862
4863 // The new location is going live
4864 genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(treeNode));
4865 }
4866 }
4867 }
4868 }
4869
4870 genProduceReg(treeNode);
4871}
4872
4873//------------------------------------------------------------------------
4874// genCodeForStoreInd: Produce code for a GT_STOREIND node.
4875//
4876// Arguments:
4877// tree - the GT_STOREIND node
4878//
4879void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
4880{
4881 assert(tree->OperIs(GT_STOREIND));
4882
4883#ifdef FEATURE_SIMD
4884 // Storing Vector3 of size 12 bytes through indirection
4885 if (tree->TypeGet() == TYP_SIMD12)
4886 {
4887 genStoreIndTypeSIMD12(tree);
4888 return;
4889 }
4890#endif // FEATURE_SIMD
4891
4892 GenTree* data = tree->Data();
4893 GenTree* addr = tree->Addr();
4894 var_types targetType = tree->TypeGet();
4895
4896 assert(!varTypeIsFloating(targetType) || (targetType == data->TypeGet()));
4897
4898 GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(tree, data);
4899 if (writeBarrierForm != GCInfo::WBF_NoBarrier)
4900 {
4901 // data and addr must be in registers.
4902 // Consume both registers so that any copies of interfering registers are taken care of.
4903 genConsumeOperands(tree);
4904
4905 if (genEmitOptimizedGCWriteBarrier(writeBarrierForm, addr, data))
4906 {
4907 return;
4908 }
4909
4910 // At this point, we should not have any interference.
4911 // That is, 'data' must not be in REG_ARG_0, as that is where 'addr' must go.
4912 noway_assert(data->gtRegNum != REG_ARG_0);
4913
4914 // addr goes in REG_ARG_0
4915 genCopyRegIfNeeded(addr, REG_ARG_0);
4916
4917 // data goes in REG_ARG_1
4918 genCopyRegIfNeeded(data, REG_ARG_1);
4919
4920 genGCWriteBarrier(tree, writeBarrierForm);
4921 }
4922 else
4923 {
4924 bool dataIsUnary = false;
4925 bool isRMWMemoryOp = tree->IsRMWMemoryOp();
4926 GenTree* rmwSrc = nullptr;
4927
4928 // We must consume the operands in the proper execution order, so that liveness is
4929 // updated appropriately.
4930 genConsumeAddress(addr);
4931
        // If the tree represents an RMW memory op, then its data is a non-leaf node marked as
        // contained, and the non-indir operand of data is the source of the RMW memory op.
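        // For example (illustrative), "*(p) = *(p) + c" with a contained constant c becomes a single
        // "add dword ptr [addrReg], c" instead of a load/add/store sequence.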
4934 if (isRMWMemoryOp)
4935 {
4936 assert(data->isContained() && !data->OperIsLeaf());
4937
4938 GenTree* rmwDst = nullptr;
4939
4940 dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0);
4941 if (!dataIsUnary)
4942 {
4943 if (tree->IsRMWDstOp1())
4944 {
4945 rmwDst = data->gtGetOp1();
4946 rmwSrc = data->gtGetOp2();
4947 }
4948 else
4949 {
4950 assert(tree->IsRMWDstOp2());
4951 rmwDst = data->gtGetOp2();
4952 rmwSrc = data->gtGetOp1();
4953 }
4954
4955 genConsumeRegs(rmwSrc);
4956 }
4957 else
4958 {
                // *(p) = oper *(p): here addr = p, and rmwSrc = rmwDst = *(p), i.e. GT_IND(p).
4960 // For unary RMW ops, src and dst of RMW memory op is the same. Lower
4961 // clears operand counts on rmwSrc and we don't need to perform a
4962 // genConsumeReg() on it.
4963 assert(tree->IsRMWDstOp1());
4964 rmwSrc = data->gtGetOp1();
4965 rmwDst = data->gtGetOp1();
4966 assert(rmwSrc->isUsedFromMemory());
4967 }
4968
4969 assert(rmwSrc != nullptr);
4970 assert(rmwDst != nullptr);
4971 assert(Lowering::IndirsAreEquivalent(rmwDst, tree));
4972 }
4973 else
4974 {
4975 genConsumeRegs(data);
4976 }
4977
4978 if (isRMWMemoryOp)
4979 {
4980 if (dataIsUnary)
4981 {
4982 // generate code for unary RMW memory ops like neg/not
4983 getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree), tree);
4984 }
4985 else
4986 {
4987 if (data->OperIsShiftOrRotate())
4988 {
4989 // Generate code for shift RMW memory ops.
4990 // The data address needs to be op1 (it must be [addr] = [addr] <shift> <amount>, not [addr] =
4991 // <amount> <shift> [addr]).
4992 assert(tree->IsRMWDstOp1());
4993 assert(rmwSrc == data->gtGetOp2());
4994 genCodeForShiftRMW(tree);
4995 }
4996 else if (data->OperGet() == GT_ADD && (rmwSrc->IsIntegralConst(1) || rmwSrc->IsIntegralConst(-1)))
4997 {
4998 // Generate "inc/dec [mem]" instead of "add/sub [mem], 1".
4999 //
5000 // Notes:
5001 // 1) Global morph transforms GT_SUB(x, +/-1) into GT_ADD(x, -/+1).
5002 // 2) TODO-AMD64: Debugger routine NativeWalker::Decode() runs into
5003 // an assert while decoding ModR/M byte of "inc dword ptr [rax]".
5004 // It is not clear whether Decode() can handle all possible
5005 // addr modes with inc/dec. For this reason, inc/dec [mem]
5006 // is not generated while generating debuggable code. Update
5007 // the above if condition once Decode() routine is fixed.
5008 assert(rmwSrc->isContainedIntOrIImmed());
5009 instruction ins = rmwSrc->IsIntegralConst(1) ? INS_inc : INS_dec;
5010 getEmitter()->emitInsRMW(ins, emitTypeSize(tree), tree);
5011 }
5012 else
5013 {
5014 // generate code for remaining binary RMW memory ops like add/sub/and/or/xor
5015 getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree),
5016 tree, rmwSrc);
5017 }
5018 }
5019 }
5020 else
5021 {
5022 getEmitter()->emitInsStoreInd(ins_Store(data->TypeGet()), emitTypeSize(tree), tree);
5023 }
5024 }
5025}
5026
5027//------------------------------------------------------------------------
5028// genCodeForSwap: Produce code for a GT_SWAP node.
5029//
5030// Arguments:
5031// tree - the GT_SWAP node
5032//
5033void CodeGen::genCodeForSwap(GenTreeOp* tree)
5034{
5035 assert(tree->OperIs(GT_SWAP));
5036
5037 // Swap is only supported for lclVar operands that are enregistered
5038 // We do not consume or produce any registers. Both operands remain enregistered.
5039 // However, the gc-ness may change.
5040 assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2));
5041
5042 GenTreeLclVarCommon* lcl1 = tree->gtOp1->AsLclVarCommon();
5043 LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
5044 var_types type1 = varDsc1->TypeGet();
5045 GenTreeLclVarCommon* lcl2 = tree->gtOp2->AsLclVarCommon();
5046 LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
5047 var_types type2 = varDsc2->TypeGet();
5048
5049 // We must have both int or both fp regs
5050 assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
5051
5052 // FP swap is not yet implemented (and should have NYI'd in LSRA)
5053 assert(!varTypeIsFloating(type1));
5054
5055 regNumber oldOp1Reg = lcl1->gtRegNum;
5056 regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
5057 regNumber oldOp2Reg = lcl2->gtRegNum;
5058 regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
5059
5060 // We don't call genUpdateVarReg because we don't have a tree node with the new register.
5061 varDsc1->lvRegNum = oldOp2Reg;
5062 varDsc2->lvRegNum = oldOp1Reg;
5063
5064 // Do the xchg
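    // e.g. (illustrative): xchg rax, rdx - emitted with a GC-ref size attribute if the two locals
    // differ in GC-ness, so the emitter swaps the GC-ness of the registers as well.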
5065 emitAttr size = EA_PTRSIZE;
5066 if (varTypeGCtype(type1) != varTypeGCtype(type2))
5067 {
5068 // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
5069 // Otherwise it will leave them alone, which is correct if they have the same GC-ness.
5070 size = EA_GCREF;
5071 }
5072 inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
5073
5074 // Update the gcInfo.
    // Manually remove these regs from the gc sets (mostly to avoid confusing duplicative dump output)
5076 gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
5077 gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
5078
5079 // gcMarkRegPtrVal will do the appropriate thing for non-gc types.
5080 // It will also dump the updates.
5081 gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
5082 gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
5083}
5084
5085//------------------------------------------------------------------------
5086// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized
5087// helper functions.
5088//
5089// Arguments:
5090// writeBarrierForm - the write barrier form to use
5091// addr - the address at which to do the store
5092// data - the data to store
5093//
5094// Return Value:
5095// true if an optimized write barrier form was used, false if not. If this
5096// function returns false, the caller must emit a "standard" write barrier.
5097
5098bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data)
5099{
5100 assert(writeBarrierForm != GCInfo::WBF_NoBarrier);
5101
5102#if defined(_TARGET_X86_) && NOGC_WRITE_BARRIERS
5103 if (!genUseOptimizedWriteBarriers(writeBarrierForm))
5104 {
5105 return false;
5106 }
5107
5108 const static int regToHelper[2][8] = {
5109 // If the target is known to be in managed memory
5110 {
5111 CORINFO_HELP_ASSIGN_REF_EAX, // EAX
5112 CORINFO_HELP_ASSIGN_REF_ECX, // ECX
5113 -1, // EDX (always the target address)
5114 CORINFO_HELP_ASSIGN_REF_EBX, // EBX
5115 -1, // ESP
5116 CORINFO_HELP_ASSIGN_REF_EBP, // EBP
5117 CORINFO_HELP_ASSIGN_REF_ESI, // ESI
5118 CORINFO_HELP_ASSIGN_REF_EDI, // EDI
5119 },
5120
5121 // Don't know if the target is in managed memory
5122 {
5123 CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, // EAX
5124 CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, // ECX
5125 -1, // EDX (always the target address)
5126 CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, // EBX
5127 -1, // ESP
5128 CORINFO_HELP_CHECKED_ASSIGN_REF_EBP, // EBP
5129 CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, // ESI
5130 CORINFO_HELP_CHECKED_ASSIGN_REF_EDI, // EDI
5131 },
5132 };
5133
5134 noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX);
5135 noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX);
5136 noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX);
5137 noway_assert(regToHelper[0][REG_ESP] == -1);
5138 noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP);
5139 noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI);
5140 noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI);
5141
5142 noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX);
5143 noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX);
5144 noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX);
5145 noway_assert(regToHelper[1][REG_ESP] == -1);
5146 noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP);
5147 noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI);
5148 noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI);
5149
5150 regNumber reg = data->gtRegNum;
5151 noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER));
5152
5153 // Generate the following code:
5154 // lea edx, addr
5155 // call write_barrier_helper_reg
5156
5157 // addr goes in REG_ARG_0
5158 genCopyRegIfNeeded(addr, REG_WRITE_BARRIER);
5159
5160 unsigned tgtAnywhere = 0;
5161 if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked)
5162 {
5163 tgtAnywhere = 1;
5164 }
5165
5166 // We might want to call a modified version of genGCWriteBarrier() to get the benefit of
5167 // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works
5168 // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here.
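// For example, storing a GC ref held in EBX through an address whose GC-ness is unknown results in a
// call to CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, with the target address passed in EDX (REG_WRITE_BARRIER).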
5169
5170 genEmitHelperCall(regToHelper[tgtAnywhere][reg],
5171 0, // argSize
5172 EA_PTRSIZE); // retSize
5173
5174 return true;
5175#else // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
5176 return false;
5177#endif // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
5178}
5179
5180// Produce code for a GT_CALL node
5181void CodeGen::genCallInstruction(GenTreeCall* call)
5182{
5183 genAlignStackBeforeCall(call);
5184
5185 gtCallTypes callType = (gtCallTypes)call->gtCallType;
5186
5187 IL_OFFSETX ilOffset = BAD_IL_OFFSET;
5188
5189 // all virtuals should have been expanded into a control expression
5190 assert(!call->IsVirtual() || call->gtControlExpr || call->gtCallAddr);
5191
5192 // Insert a GS check if necessary
5193 if (call->IsTailCallViaHelper())
5194 {
5195 if (compiler->getNeedsGSSecurityCookie())
5196 {
5197#if FEATURE_FIXED_OUT_ARGS
5198 // If either of the conditions below is true, we will need a temporary register in order to perform the GS
5199 // cookie check. When FEATURE_FIXED_OUT_ARGS is disabled, we save and restore the temporary register using
5200 // push/pop. When FEATURE_FIXED_OUT_ARGS is enabled, however, we need an alternative solution. For now,
5201 // though, the tail prefix is ignored on all platforms that use fixed out args, so we should never hit this
5202 // case.
5203 assert(compiler->gsGlobalSecurityCookieAddr == nullptr);
5204 assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal);
5205#endif
5206 genEmitGSCookieCheck(true);
5207 }
5208 }
5209
5210 // Consume all the arg regs
5211 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
5212 {
5213 assert(list->OperIsList());
5214
5215 GenTree* argNode = list->Current();
5216
5217 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode->gtSkipReloadOrCopy());
5218 assert(curArgTabEntry);
5219
5220 if (curArgTabEntry->regNum == REG_STK)
5221 {
5222 continue;
5223 }
5224
5225#ifdef UNIX_AMD64_ABI
5226 // Deal with multi register passed struct args.
5227 if (argNode->OperGet() == GT_FIELD_LIST)
5228 {
5229 GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
5230 unsigned iterationNum = 0;
5231 for (; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), iterationNum++)
5232 {
5233 GenTree* putArgRegNode = fieldListPtr->gtOp.gtOp1;
5234 assert(putArgRegNode->gtOper == GT_PUTARG_REG);
5235 regNumber argReg = REG_NA;
5236
5237 if (iterationNum == 0)
5238 {
5239 argReg = curArgTabEntry->regNum;
5240 }
5241 else
5242 {
5243 assert(iterationNum == 1);
5244 argReg = curArgTabEntry->otherRegNum;
5245 }
5246
5247 genConsumeReg(putArgRegNode);
5248
5249 // Validate the putArgRegNode has the right type.
5250 assert(varTypeIsFloating(putArgRegNode->TypeGet()) == genIsValidFloatReg(argReg));
5251 if (putArgRegNode->gtRegNum != argReg)
5252 {
5253 inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), false), argReg, putArgRegNode->gtRegNum);
5254 }
5255 }
5256 }
5257 else
5258#endif // UNIX_AMD64_ABI
5259 {
5260 regNumber argReg = curArgTabEntry->regNum;
5261 genConsumeReg(argNode);
5262 if (argNode->gtRegNum != argReg)
5263 {
5264 inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), false), argReg, argNode->gtRegNum);
5265 }
5266 }
5267
5268#if FEATURE_VARARG
5269 // In the case of a varargs call,
5270 // the ABI dictates that if we have floating point args,
5271 // we must pass the enregistered arguments in both the
// integer and floating point registers, so let's do that.
5273 if (call->IsVarargs() && varTypeIsFloating(argNode))
5274 {
5275 regNumber targetReg = compiler->getCallArgIntRegister(argNode->gtRegNum);
5276 instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG);
5277 inst_RV_RV(ins, argNode->gtRegNum, targetReg);
5278 }
5279#endif // FEATURE_VARARG
5280 }
5281
5282#if defined(_TARGET_X86_) || defined(UNIX_AMD64_ABI)
// The call will pop its arguments.
// Compute the total outgoing stack argument size by summing the size of each putarg_stk node.
5285 ssize_t stackArgBytes = 0;
5286 GenTree* args = call->gtCallArgs;
5287 while (args)
5288 {
5289 GenTree* arg = args->gtOp.gtOp1;
5290 if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG))
5291 {
5292 if (arg->OperGet() == GT_PUTARG_STK)
5293 {
5294 GenTree* source = arg->gtOp.gtOp1;
5295 unsigned size = arg->AsPutArgStk()->getArgSize();
5296 stackArgBytes += size;
5297#ifdef DEBUG
5298 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
5299 assert(curArgTabEntry);
5300 assert(size == (curArgTabEntry->numSlots * TARGET_POINTER_SIZE));
5301#ifdef FEATURE_PUT_STRUCT_ARG_STK
5302 if (source->TypeGet() == TYP_STRUCT)
5303 {
5304 GenTreeObj* obj = source->AsObj();
5305 unsigned argBytes = roundUp(obj->gtBlkSize, TARGET_POINTER_SIZE);
5306 assert((curArgTabEntry->numSlots * TARGET_POINTER_SIZE) == argBytes);
5307 }
5308#endif // FEATURE_PUT_STRUCT_ARG_STK
5309#endif // DEBUG
5310 }
5311 }
5312 args = args->gtOp.gtOp2;
5313 }
5314#endif // defined(_TARGET_X86_) || defined(UNIX_AMD64_ABI)
5315
5316 // Insert a null check on "this" pointer if asked.
5317 if (call->NeedsNullCheck())
5318 {
5319 const regNumber regThis = genGetThisArgReg(call);
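// The cmp below dereferences [regThis]; its only purpose is to fault if 'this' is null, and the
// result of the comparison itself is ignored.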
5320 getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, regThis, regThis, 0);
5321 }
5322
5323 // Either gtControlExpr != null or gtCallAddr != null or it is a direct non-virtual call to a user or helper method.
5324 CORINFO_METHOD_HANDLE methHnd;
5325 GenTree* target = call->gtControlExpr;
5326 if (callType == CT_INDIRECT)
5327 {
5328 assert(target == nullptr);
5329 target = call->gtCallAddr;
5330 methHnd = nullptr;
5331 }
5332 else
5333 {
5334 methHnd = call->gtCallMethHnd;
5335 }
5336
5337 CORINFO_SIG_INFO* sigInfo = nullptr;
5338#ifdef DEBUG
5339 // Pass the call signature information down into the emitter so the emitter can associate
5340 // native call sites with the signatures they were generated from.
5341 if (callType != CT_HELPER)
5342 {
5343 sigInfo = call->callSig;
5344 }
5345#endif // DEBUG
5346
// If this is a fast tail call, then we are done. In this case we have set up the args (both reg args
// and stack args in the incoming arg area) and the call target in rax. The epilog sequence will
// generate "jmp rax".
5350 if (call->IsFastTailCall())
5351 {
5352 // Don't support fast tail calling JIT helpers
5353 assert(callType != CT_HELPER);
5354
5355 // Fast tail calls materialize call target either in gtControlExpr or in gtCallAddr.
5356 assert(target != nullptr);
5357
5358 genConsumeReg(target);
5359 genCopyRegIfNeeded(target, REG_RAX);
5360 return;
5361 }
5362
// For a pinvoke to unmanaged code we emit a label to clear
5364 // the GC pointer state before the callsite.
5365 // We can't utilize the typical lazy killing of GC pointers
5366 // at (or inside) the callsite.
5367 if (compiler->killGCRefs(call))
5368 {
5369 genDefineTempLabel(genCreateTempLabel());
5370 }
5371
5372 // Determine return value size(s).
5373 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
5374 emitAttr retSize = EA_PTRSIZE;
5375 emitAttr secondRetSize = EA_UNKNOWN;
5376
5377 if (call->HasMultiRegRetVal())
5378 {
5379 retSize = emitTypeSize(retTypeDesc->GetReturnRegType(0));
5380 secondRetSize = emitTypeSize(retTypeDesc->GetReturnRegType(1));
5381 }
5382 else
5383 {
5384 assert(!varTypeIsStruct(call));
5385
5386 if (call->gtType == TYP_REF)
5387 {
5388 retSize = EA_GCREF;
5389 }
5390 else if (call->gtType == TYP_BYREF)
5391 {
5392 retSize = EA_BYREF;
5393 }
5394 }
5395
5396#if defined(DEBUG) && defined(_TARGET_X86_)
5397 // Store the stack pointer so we can check it after the call.
5398 if (compiler->opts.compStackCheckOnCall && call->gtCallType == CT_USER_FUNC)
5399 {
5400 noway_assert(compiler->lvaCallSpCheck != 0xCCCCCCCC &&
5401 compiler->lvaTable[compiler->lvaCallSpCheck].lvDoNotEnregister &&
5402 compiler->lvaTable[compiler->lvaCallSpCheck].lvOnFrame);
5403 getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaCallSpCheck, 0);
5404 }
5405#endif // defined(DEBUG) && defined(_TARGET_X86_)
5406
5407 bool fPossibleSyncHelperCall = false;
5408 CorInfoHelpFunc helperNum = CORINFO_HELP_UNDEF;
5409
5410 // We need to propagate the IL offset information to the call instruction, so we can emit
5411 // an IL to native mapping record for the call, to support managed return value debugging.
5412 // We don't want tail call helper calls that were converted from normal calls to get a record,
5413 // so we skip this hash table lookup logic in that case.
5414 if (compiler->opts.compDbgInfo && compiler->genCallSite2ILOffsetMap != nullptr && !call->IsTailCall())
5415 {
5416 (void)compiler->genCallSite2ILOffsetMap->Lookup(call, &ilOffset);
5417 }
5418
5419#if defined(_TARGET_X86_)
5420 bool fCallerPop = call->CallerPop();
5421
5422#ifdef UNIX_X86_ABI
5423 if (!call->IsUnmanaged())
5424 {
5425 CorInfoCallConv callConv = CORINFO_CALLCONV_DEFAULT;
5426
5427 if ((callType != CT_HELPER) && call->callSig)
5428 {
5429 callConv = call->callSig->callConv;
5430 }
5431
5432 fCallerPop |= IsCallerPop(callConv);
5433 }
5434#endif // UNIX_X86_ABI
5435
5436 // If the callee pops the arguments, we pass a positive value as the argSize, and the emitter will
5437 // adjust its stack level accordingly.
5438 // If the caller needs to explicitly pop its arguments, we must pass a negative value, and then do the
5439 // pop when we're done.
5440 ssize_t argSizeForEmitter = stackArgBytes;
5441 if (fCallerPop)
5442 {
5443 argSizeForEmitter = -stackArgBytes;
5444 }
5445#endif // defined(_TARGET_X86_)
5446
// When this is a PInvoke call to a user function (CT_USER_FUNC) and the function being compiled
// contains 256-bit AVX instructions, we issue VZEROUPPER here to avoid the AVX-256 to legacy SSE
// transition penalty, assuming the callee contains legacy SSE instructions.
// To limit the code size impact we only issue VZEROUPPER before the PInvoke call, not after it,
// because the transition penalty from legacy SSE to AVX only happens when a preceding 256-bit AVX
// to legacy SSE transition penalty has occurred.
5453 if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
5454 {
5455 assert(compiler->canUseVexEncoding());
5456 instGen(INS_vzeroupper);
5457 }
5458
5459 if (target != nullptr)
5460 {
5461#ifdef _TARGET_X86_
5462 if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
5463 {
5464 // On x86, we need to generate a very specific pattern for indirect VSD calls:
5465 //
5466 // 3-byte nop
5467 // call dword ptr [eax]
5468 //
5469 // Where EAX is also used as an argument to the stub dispatch helper. Make
5470 // sure that the call target address is computed into EAX in this case.
5471
5472 assert(compiler->virtualStubParamInfo->GetReg() == REG_VIRTUAL_STUB_TARGET);
5473
5474 assert(target->isContainedIndir());
5475 assert(target->OperGet() == GT_IND);
5476
5477 GenTree* addr = target->AsIndir()->Addr();
5478 assert(addr->isUsedFromReg());
5479
5480 genConsumeReg(addr);
5481 genCopyRegIfNeeded(addr, REG_VIRTUAL_STUB_TARGET);
5482
5483 getEmitter()->emitIns_Nop(3);
5484
5485 // clang-format off
5486 getEmitter()->emitIns_Call(emitter::EmitCallType(emitter::EC_INDIR_ARD),
5487 methHnd,
5488 INDEBUG_LDISASM_COMMA(sigInfo)
5489 nullptr,
5490 argSizeForEmitter,
5491 retSize
5492 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5493 gcInfo.gcVarPtrSetCur,
5494 gcInfo.gcRegGCrefSetCur,
5495 gcInfo.gcRegByrefSetCur,
5496 ilOffset, REG_VIRTUAL_STUB_TARGET, REG_NA, 1, 0);
5497 // clang-format on
5498 }
5499 else
5500#endif
5501 if (target->isContainedIndir())
5502 {
5503 if (target->AsIndir()->HasBase() && target->AsIndir()->Base()->isContainedIntOrIImmed())
5504 {
5505 // Note that if gtControlExpr is an indir of an absolute address, we mark it as
// contained only if it can be encoded as a PC-relative offset.
5507 assert(target->AsIndir()->Base()->AsIntConCommon()->FitsInAddrBase(compiler));
5508
5509 // clang-format off
5510 genEmitCall(emitter::EC_FUNC_TOKEN_INDIR,
5511 methHnd,
5512 INDEBUG_LDISASM_COMMA(sigInfo)
5513 (void*) target->AsIndir()->Base()->AsIntConCommon()->IconValue()
5514 X86_ARG(argSizeForEmitter),
5515 retSize
5516 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5517 ilOffset);
5518 // clang-format on
5519 }
5520 else
5521 {
5522 // clang-format off
5523 genEmitCall(emitter::EC_INDIR_ARD,
5524 methHnd,
5525 INDEBUG_LDISASM_COMMA(sigInfo)
5526 target->AsIndir()
5527 X86_ARG(argSizeForEmitter),
5528 retSize
5529 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5530 ilOffset);
5531 // clang-format on
5532 }
5533 }
5534 else
5535 {
5536 // We have already generated code for gtControlExpr evaluating it into a register.
5537 // We just need to emit "call reg" in this case.
5538 assert(genIsValidIntReg(target->gtRegNum));
5539
5540 // clang-format off
5541 genEmitCall(emitter::EC_INDIR_R,
5542 methHnd,
5543 INDEBUG_LDISASM_COMMA(sigInfo)
5544 nullptr // addr
5545 X86_ARG(argSizeForEmitter),
5546 retSize
5547 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5548 ilOffset,
5549 genConsumeReg(target));
5550 // clang-format on
5551 }
5552 }
5553#ifdef FEATURE_READYTORUN_COMPILER
5554 else if (call->gtEntryPoint.addr != nullptr)
5555 {
5556 // clang-format off
5557 genEmitCall((call->gtEntryPoint.accessType == IAT_VALUE) ? emitter::EC_FUNC_TOKEN
5558 : emitter::EC_FUNC_TOKEN_INDIR,
5559 methHnd,
5560 INDEBUG_LDISASM_COMMA(sigInfo)
5561 (void*) call->gtEntryPoint.addr
5562 X86_ARG(argSizeForEmitter),
5563 retSize
5564 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5565 ilOffset);
5566 // clang-format on
5567 }
5568#endif
5569 else
5570 {
5571 // Generate a direct call to a non-virtual user defined or helper method
5572 assert(callType == CT_HELPER || callType == CT_USER_FUNC);
5573
5574 void* addr = nullptr;
5575 if (callType == CT_HELPER)
5576 {
5577 // Direct call to a helper method.
5578 helperNum = compiler->eeGetHelperNum(methHnd);
5579 noway_assert(helperNum != CORINFO_HELP_UNDEF);
5580
5581 void* pAddr = nullptr;
5582 addr = compiler->compGetHelperFtn(helperNum, (void**)&pAddr);
5583 assert(pAddr == nullptr);
5584
5585 // tracking of region protected by the monitor in synchronized methods
5586 if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
5587 {
5588 fPossibleSyncHelperCall = true;
5589 }
5590 }
5591 else
5592 {
5593 // Direct call to a non-virtual user function.
5594 addr = call->gtDirectCallAddress;
5595 }
5596
5597 assert(addr != nullptr);
5598
5599 // Non-virtual direct calls to known addresses
5600
5601 // clang-format off
5602 genEmitCall(emitter::EC_FUNC_TOKEN,
5603 methHnd,
5604 INDEBUG_LDISASM_COMMA(sigInfo)
5605 addr
5606 X86_ARG(argSizeForEmitter),
5607 retSize
5608 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
5609 ilOffset);
5610 // clang-format on
5611 }
5612
// If it was a pinvoke, we may have needed to get the address of a label
5614 if (genPendingCallLabel)
5615 {
5616 assert(call->IsUnmanaged());
5617 genDefineTempLabel(genPendingCallLabel);
5618 genPendingCallLabel = nullptr;
5619 }
5620
5621 // Update GC info:
5622 // All Callee arg registers are trashed and no longer contain any GC pointers.
// TODO-XArch-Bug?: As a matter of fact, shouldn't we be killing all of the callee-trashed regs here?
// For now we assert that, other than the arg regs, the gcref/byref sets don't contain any other
// registers from RBM_CALLEE_TRASH.
5626 assert((gcInfo.gcRegGCrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
5627 assert((gcInfo.gcRegByrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
5628 gcInfo.gcRegGCrefSetCur &= ~RBM_ARG_REGS;
5629 gcInfo.gcRegByrefSetCur &= ~RBM_ARG_REGS;
5630
5631 var_types returnType = call->TypeGet();
5632 if (returnType != TYP_VOID)
5633 {
5634#ifdef _TARGET_X86_
5635 if (varTypeIsFloating(returnType))
5636 {
5637 // Spill the value from the fp stack.
5638 // Then, load it into the target register.
5639 call->gtFlags |= GTF_SPILL;
5640 regSet.rsSpillFPStack(call);
5641 call->gtFlags |= GTF_SPILLED;
5642 call->gtFlags &= ~GTF_SPILL;
5643 }
5644 else
5645#endif // _TARGET_X86_
5646 {
5647 regNumber returnReg;
5648
5649 if (call->HasMultiRegRetVal())
5650 {
5651 assert(retTypeDesc != nullptr);
5652 unsigned regCount = retTypeDesc->GetReturnRegCount();
5653
// If the regs allocated to the call node are different from the ABI return
// regs in which the call has returned its result, move the result
// to the regs allocated to the call node.
5657 for (unsigned i = 0; i < regCount; ++i)
5658 {
5659 var_types regType = retTypeDesc->GetReturnRegType(i);
5660 returnReg = retTypeDesc->GetABIReturnReg(i);
5661 regNumber allocatedReg = call->GetRegNumByIdx(i);
5662 if (returnReg != allocatedReg)
5663 {
5664 inst_RV_RV(ins_Copy(regType), allocatedReg, returnReg, regType);
5665 }
5666 }
5667
5668#ifdef FEATURE_SIMD
5669 // A Vector3 return value is stored in xmm0 and xmm1.
5670 // RyuJIT assumes that the upper unused bits of xmm1 are cleared but
5671 // the native compiler doesn't guarantee it.
5672 if (returnType == TYP_SIMD12)
5673 {
5674 returnReg = retTypeDesc->GetABIReturnReg(1);
// Keep only the low 32 bits of xmm1 (the third element) by clearing the upper 96 bits with two
// byte-shift instructions:
// retReg = retReg << 96
// retReg = retReg >> 96
5678 getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
5679 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
5680 }
5681#endif // FEATURE_SIMD
5682 }
5683 else
5684 {
5685#ifdef _TARGET_X86_
5686 if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
5687 {
5688 // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
5689 // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
5690 // correct argument registers.
5691 returnReg = REG_PINVOKE_TCB;
5692 }
5693 else
5694#endif // _TARGET_X86_
5695 if (varTypeIsFloating(returnType))
5696 {
5697 returnReg = REG_FLOATRET;
5698 }
5699 else
5700 {
5701 returnReg = REG_INTRET;
5702 }
5703
5704 if (call->gtRegNum != returnReg)
5705 {
5706 inst_RV_RV(ins_Copy(returnType), call->gtRegNum, returnReg, returnType);
5707 }
5708 }
5709
5710 genProduceReg(call);
5711 }
5712 }
5713
5714 // If there is nothing next, that means the result is thrown away, so this value is not live.
5715 // However, for minopts or debuggable code, we keep it live to support managed return value debugging.
5716 if ((call->gtNext == nullptr) && compiler->opts.OptimizationEnabled())
5717 {
5718 gcInfo.gcMarkRegSetNpt(RBM_INTRET);
5719 }
5720
5721#if defined(DEBUG) && defined(_TARGET_X86_)
5722 if (compiler->opts.compStackCheckOnCall && call->gtCallType == CT_USER_FUNC)
5723 {
5724 noway_assert(compiler->lvaCallSpCheck != 0xCCCCCCCC &&
5725 compiler->lvaTable[compiler->lvaCallSpCheck].lvDoNotEnregister &&
5726 compiler->lvaTable[compiler->lvaCallSpCheck].lvOnFrame);
5727 if (!fCallerPop && (stackArgBytes != 0))
5728 {
// ECX is trashed, so it can be used to compute the expected SP. We saved the value of SP
// after pushing all the stack arguments, but the callee popped the arguments, so we need
// to do some math to get the correct comparison.
5732 getEmitter()->emitIns_R_R(INS_mov, EA_4BYTE, REG_ARG_0, REG_SPBASE);
5733 getEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_ARG_0, stackArgBytes);
5734 getEmitter()->emitIns_S_R(INS_cmp, EA_4BYTE, REG_ARG_0, compiler->lvaCallSpCheck, 0);
5735 }
5736 else
5737 {
5738 getEmitter()->emitIns_S_R(INS_cmp, EA_4BYTE, REG_SPBASE, compiler->lvaCallSpCheck, 0);
5739 }
5740
5741 BasicBlock* sp_check = genCreateTempLabel();
5742 getEmitter()->emitIns_J(INS_je, sp_check);
5743 instGen(INS_BREAKPOINT);
5744 genDefineTempLabel(sp_check);
5745 }
5746#endif // defined(DEBUG) && defined(_TARGET_X86_)
5747
5748#if !FEATURE_EH_FUNCLETS
5749 //-------------------------------------------------------------------------
5750 // Create a label for tracking of region protected by the monitor in synchronized methods.
5751 // This needs to be here, rather than above where fPossibleSyncHelperCall is set,
5752 // so the GC state vars have been updated before creating the label.
5753
5754 if (fPossibleSyncHelperCall)
5755 {
5756 switch (helperNum)
5757 {
5758 case CORINFO_HELP_MON_ENTER:
5759 case CORINFO_HELP_MON_ENTER_STATIC:
5760 noway_assert(compiler->syncStartEmitCookie == NULL);
5761 compiler->syncStartEmitCookie =
5762 getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
5763 noway_assert(compiler->syncStartEmitCookie != NULL);
5764 break;
5765 case CORINFO_HELP_MON_EXIT:
5766 case CORINFO_HELP_MON_EXIT_STATIC:
5767 noway_assert(compiler->syncEndEmitCookie == NULL);
5768 compiler->syncEndEmitCookie =
5769 getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
5770 noway_assert(compiler->syncEndEmitCookie != NULL);
5771 break;
5772 default:
5773 break;
5774 }
5775 }
5776#endif // !FEATURE_EH_FUNCLETS
5777
5778 unsigned stackAdjustBias = 0;
5779
5780#if defined(_TARGET_X86_)
5781 // Is the caller supposed to pop the arguments?
5782 if (fCallerPop && (stackArgBytes != 0))
5783 {
5784 stackAdjustBias = stackArgBytes;
5785 }
5786
5787 SubtractStackLevel(stackArgBytes);
5788#endif // _TARGET_X86_
5789
5790 genRemoveAlignmentAfterCall(call, stackAdjustBias);
5791}
5792
5793// Produce code for a GT_JMP node.
// The caller's arguments need to be transferred to the callee before exiting the caller.
// The actual jump to the callee is generated as part of the caller's epilog sequence.
// Therefore the codegen of GT_JMP only has to ensure that the callee's arguments are set up correctly.
5797void CodeGen::genJmpMethod(GenTree* jmp)
5798{
5799 assert(jmp->OperGet() == GT_JMP);
5800 assert(compiler->compJmpOpUsed);
5801
5802 // If no arguments, nothing to do
5803 if (compiler->info.compArgsCount == 0)
5804 {
5805 return;
5806 }
5807
5808 // Make sure register arguments are in their initial registers
5809 // and stack arguments are put back as well.
5810 unsigned varNum;
5811 LclVarDsc* varDsc;
5812
// First move any enregistered stack arguments back to the stack.
// At the same time, any reg arg that is not in its correct register is moved back to its stack location.
//
// We are not strictly required to spill reg args that are not in the desired reg for a jmp call,
// but doing otherwise would require us to deal with circularity while moving values around. Spilling
// to the stack keeps the implementation simple, which is not a bad trade-off given that jmp calls
// are not frequent.
5820 for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
5821 {
5822 varDsc = compiler->lvaTable + varNum;
5823
5824 if (varDsc->lvPromoted)
5825 {
5826 noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
5827
5828 unsigned fieldVarNum = varDsc->lvFieldLclStart;
5829 varDsc = compiler->lvaTable + fieldVarNum;
5830 }
5831 noway_assert(varDsc->lvIsParam);
5832
5833 if (varDsc->lvIsRegArg && (varDsc->lvRegNum != REG_STK))
5834 {
// Skip reg args that are already in the right register for the jmp call.
// If not, we will spill such args to their stack locations.
5837 //
5838 // If we need to generate a tail call profiler hook, then spill all
5839 // arg regs to free them up for the callback.
5840 if (!compiler->compIsProfilerHookNeeded() && (varDsc->lvRegNum == varDsc->lvArgReg))
5841 {
5842 continue;
5843 }
5844 }
5845 else if (varDsc->lvRegNum == REG_STK)
5846 {
// Skip args that are currently living on the stack.
5848 continue;
5849 }
5850
// If we came here, it means we have either a reg argument that is not in the right register or
// a stack argument that is currently living in a register. In either case the following
// assert should hold.
5854 assert(varDsc->lvRegNum != REG_STK);
5855
5856 assert(!varDsc->lvIsStructField || (compiler->lvaTable[varDsc->lvParentLcl].lvFieldCnt == 1));
5857 var_types storeType = genActualType(varDsc->lvaArgType()); // We own the memory and can use the full move.
5858 getEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), varDsc->lvRegNum, varNum, 0);
5859
5860 // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live.
5861 // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
5862 // Therefore manually update life of varDsc->lvRegNum.
5863 regMaskTP tempMask = varDsc->lvRegMask();
5864 regSet.RemoveMaskVars(tempMask);
5865 gcInfo.gcMarkRegSetNpt(tempMask);
5866 if (compiler->lvaIsGCTracked(varDsc))
5867 {
5868#ifdef DEBUG
5869 if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
5870 {
5871 JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum);
5872 }
5873 else
5874 {
5875 JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum);
5876 }
5877#endif // DEBUG
5878
5879 VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5880 }
5881 }
5882
5883#ifdef PROFILING_SUPPORTED
5884 // At this point all arg regs are free.
5885 // Emit tail call profiler callback.
5886 genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
5887#endif
5888
// Next, move any register arguments that are not currently enregistered back into their arg registers.
5890 regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method.
5891 unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method.
5892 for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
5893 {
5894 varDsc = compiler->lvaTable + varNum;
5895 if (varDsc->lvPromoted)
5896 {
5897 noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
5898
5899 unsigned fieldVarNum = varDsc->lvFieldLclStart;
5900 varDsc = compiler->lvaTable + fieldVarNum;
5901 }
5902 noway_assert(varDsc->lvIsParam);
5903
5904 // Skip if arg not passed in a register.
5905 if (!varDsc->lvIsRegArg)
5906 {
5907 continue;
5908 }
5909
5910#if defined(UNIX_AMD64_ABI)
5911 if (varTypeIsStruct(varDsc))
5912 {
5913 CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
5914 assert(typeHnd != nullptr);
5915
5916 SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
5917 compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
5918 assert(structDesc.passedInRegisters);
5919
5920 unsigned __int8 offset0 = 0;
5921 unsigned __int8 offset1 = 0;
5922 var_types type0 = TYP_UNKNOWN;
5923 var_types type1 = TYP_UNKNOWN;
5924
5925 // Get the eightbyte data
5926 compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1);
5927
5928 // Move the values into the right registers.
5929 //
5930
5931 // Update varDsc->lvArgReg and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and
5932 // argReg is going live. Note that we cannot modify varDsc->lvRegNum and lvOtherArgReg here because another
5933 // basic block may not be expecting it. Therefore manually update life of argReg. Note that GT_JMP marks
5934 // the end of the basic block and after which reg life and gc info will be recomputed for the new block in
5935 // genCodeForBBList().
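// As an illustrative example (not from the source), a struct { object o; double d; } argument would
// yield type0 = TYP_REF at offset 0, loaded into the integer register lvArgReg, and type1 = TYP_DOUBLE
// at offset 8, loaded into the float register lvOtherArgReg.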
5936 if (type0 != TYP_UNKNOWN)
5937 {
5938 getEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->lvArgReg, varNum, offset0);
5939 regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg);
5940 gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0);
5941 }
5942
5943 if (type1 != TYP_UNKNOWN)
5944 {
5945 getEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->lvOtherArgReg, varNum, offset1);
5946 regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg);
5947 gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1);
5948 }
5949
5950 if (varDsc->lvTracked)
5951 {
5952 VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5953 }
5954 }
5955 else
#endif // defined(UNIX_AMD64_ABI)
5957 {
5958 // Register argument
5959 noway_assert(isRegParamType(genActualType(varDsc->TypeGet())));
5960
5961 // Is register argument already in the right register?
5962 // If not load it from its stack location.
5963 var_types loadType = varDsc->lvaArgType();
5964 regNumber argReg = varDsc->lvArgReg; // incoming arg register
5965
5966 if (varDsc->lvRegNum != argReg)
5967 {
5968 assert(genIsValidReg(argReg));
5969 getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
5970
5971 // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
5972 // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
5973 // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
5974 // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
5975 regSet.AddMaskVars(genRegMask(argReg));
5976 gcInfo.gcMarkRegPtrVal(argReg, loadType);
5977 if (compiler->lvaIsGCTracked(varDsc))
5978 {
5979#ifdef DEBUG
5980 if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
5981 {
5982 JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum);
5983 }
5984 else
5985 {
5986 JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum);
5987 }
5988#endif // DEBUG
5989
5990 VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
5991 }
5992 }
5993 }
5994
5995#if FEATURE_VARARG && defined(_TARGET_AMD64_)
5996 // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg
5997 // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to
5998 // be passed in both integer and floating point registers. It doesn't apply to x86, which passes floating point
5999 // values on the stack.
6000 if (compiler->info.compIsVarArgs)
6001 {
6002 regNumber intArgReg;
6003 var_types loadType = varDsc->lvaArgType();
6004 regNumber argReg = varDsc->lvArgReg; // incoming arg register
6005
6006 if (varTypeIsFloating(loadType))
6007 {
6008 intArgReg = compiler->getCallArgIntRegister(argReg);
6009 instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG);
6010 inst_RV_RV(ins, argReg, intArgReg, loadType);
6011 }
6012 else
6013 {
6014 intArgReg = argReg;
6015 }
6016
6017 fixedIntArgMask |= genRegMask(intArgReg);
6018
6019 if (intArgReg == REG_ARG_0)
6020 {
6021 assert(firstArgVarNum == BAD_VAR_NUM);
6022 firstArgVarNum = varNum;
6023 }
6024 }
6025#endif // FEATURE_VARARG
6026 }
6027
6028#if FEATURE_VARARG && defined(_TARGET_AMD64_)
6029 // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments,
6030 // load the remaining arg registers (both int and float) from the corresponding
// shadow stack slots. This is because we don't know the number and types of the non-fixed params
// passed by the caller, so we have to assume the worst case: the caller passing float/double args
// in both the int and float arg regs.
6034 //
6035 // This doesn't apply to x86, which doesn't pass floating point values in floating
6036 // point registers.
6037 //
// The caller could have passed gc-ref/byref typed var args. Since these are var args,
// the callee has no way of knowing their gc-ness. Therefore, mark the region that loads
// the remaining arg registers from shadow stack slots as non-GC interruptible.
6041 if (fixedIntArgMask != RBM_NONE)
6042 {
6043 assert(compiler->info.compIsVarArgs);
6044 assert(firstArgVarNum != BAD_VAR_NUM);
6045
6046 regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask;
6047 if (remainingIntArgMask != RBM_NONE)
6048 {
6049 instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE);
6050 getEmitter()->emitDisableGC();
6051 for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum)
6052 {
6053 regNumber argReg = intArgRegs[argNum];
6054 regMaskTP argRegMask = genRegMask(argReg);
6055
6056 if ((remainingIntArgMask & argRegMask) != 0)
6057 {
6058 remainingIntArgMask &= ~argRegMask;
6059 getEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset);
6060
6061 // also load it in corresponding float arg reg
6062 regNumber floatReg = compiler->getCallArgFloatRegister(argReg);
6063 inst_RV_RV(insCopyIntToFloat, floatReg, argReg);
6064 }
6065
6066 argOffset += REGSIZE_BYTES;
6067 }
6068 getEmitter()->emitEnableGC();
6069 }
6070 }
6071#endif // FEATURE_VARARG
6072}
6073
// Produce code for a GT_LEA subnode
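// The emitted instruction has the general form "lea dstReg, [base + index*scale + offset]", where the
// base and/or index component may be absent.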
6075void CodeGen::genLeaInstruction(GenTreeAddrMode* lea)
6076{
6077 emitAttr size = emitTypeSize(lea);
6078 genConsumeOperands(lea);
6079
6080 if (lea->Base() && lea->Index())
6081 {
6082 regNumber baseReg = lea->Base()->gtRegNum;
6083 regNumber indexReg = lea->Index()->gtRegNum;
6084 getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, baseReg, indexReg, lea->gtScale, lea->Offset());
6085 }
6086 else if (lea->Base())
6087 {
6088 getEmitter()->emitIns_R_AR(INS_lea, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->Offset());
6089 }
6090 else if (lea->Index())
6091 {
6092 getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, REG_NA, lea->Index()->gtRegNum, lea->gtScale,
6093 lea->Offset());
6094 }
6095
6096 genProduceReg(lea);
6097}
6098
6099//-------------------------------------------------------------------------------------------
6100// genJumpKindsForTree: Determine the number and kinds of conditional branches
6101// necessary to implement the given GT_CMP node
6102//
6103// Arguments:
// cmpTree - (input) The GenTree relop node that is used to set the condition codes
// jmpKind[2] - (output) One or two conditional branch instructions
// jmpToTrueLabel[2] - (output) When true we branch to the true case;
// when false we create a second label and branch to the false case.
// Only GT_EQ for floating point compares can have a false value.
6110//
6111// Return Value:
6112// Sets the proper values into the array elements of jmpKind[] and jmpToTrueLabel[]
6113//
6114// Assumptions:
6115// At least one conditional branch instruction will be returned.
6116// Typically only one conditional branch is needed
6117// and the second jmpKind[] value is set to EJ_NONE
6118//
6119// Notes:
6120// jmpToTrueLabel[i]= true implies branch when the compare operation is true.
6121// jmpToTrueLabel[i]= false implies branch when the compare operation is false.
6122//-------------------------------------------------------------------------------------------
6123
6124// static
6125void CodeGen::genJumpKindsForTree(GenTree* cmpTree, emitJumpKind jmpKind[2], bool jmpToTrueLabel[2])
6126{
6127 // Except for BEQ (= ordered GT_EQ) both jumps are to the true label.
6128 jmpToTrueLabel[0] = true;
6129 jmpToTrueLabel[1] = true;
6130
6131 // For integer comparisons just use genJumpKindForOper
6132 if (!varTypeIsFloating(cmpTree->gtOp.gtOp1))
6133 {
6134 CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
6135 jmpKind[0] = genJumpKindForOper(cmpTree->gtOper, compareKind);
6136 jmpKind[1] = EJ_NONE;
6137 }
6138 else
6139 {
6140 assert(cmpTree->OperIsCompare());
6141
6142 // For details on how we arrived at this mapping, see the comment block in genCodeForTreeNode()
// while generating code for compare operators (e.g. GT_EQ etc).
6144 if ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) != 0)
6145 {
6146 // Must branch if we have an NaN, unordered
6147 switch (cmpTree->gtOper)
6148 {
6149 case GT_LT:
6150 case GT_GT:
6151 jmpKind[0] = EJ_jb;
6152 jmpKind[1] = EJ_NONE;
6153 break;
6154
6155 case GT_LE:
6156 case GT_GE:
6157 jmpKind[0] = EJ_jbe;
6158 jmpKind[1] = EJ_NONE;
6159 break;
6160
6161 case GT_NE:
6162 jmpKind[0] = EJ_jpe;
6163 jmpKind[1] = EJ_jne;
6164 break;
6165
6166 case GT_EQ:
6167 jmpKind[0] = EJ_je;
6168 jmpKind[1] = EJ_NONE;
6169 break;
6170
6171 default:
6172 unreached();
6173 }
6174 }
6175 else // ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) == 0)
6176 {
6177 // Do not branch if we have an NaN, unordered
6178 switch (cmpTree->gtOper)
6179 {
6180 case GT_LT:
6181 case GT_GT:
6182 jmpKind[0] = EJ_ja;
6183 jmpKind[1] = EJ_NONE;
6184 break;
6185
6186 case GT_LE:
6187 case GT_GE:
6188 jmpKind[0] = EJ_jae;
6189 jmpKind[1] = EJ_NONE;
6190 break;
6191
6192 case GT_NE:
6193 jmpKind[0] = EJ_jne;
6194 jmpKind[1] = EJ_NONE;
6195 break;
6196
6197 case GT_EQ:
6198 jmpKind[0] = EJ_jpe;
6199 jmpKind[1] = EJ_je;
6200 jmpToTrueLabel[0] = false;
6201 break;
6202
6203 default:
6204 unreached();
6205 }
6206 }
6207 }
6208}
6209
6210//------------------------------------------------------------------------
6211// genCompareFloat: Generate code for comparing two floating point values
6212//
6213// Arguments:
6214// treeNode - the compare tree
6215//
6216// Return Value:
6217// None.
6218// Comments:
// The SSE2 instruction ucomis[s|d] performs an unordered comparison and
// updates the rFLAGS register as follows.
//
// Result of compare     ZF  PF  CF
// -----------------     --  --  --
// Unordered              1   1   1  <-- this result implies one of the operands of the compare is a NaN.
// Greater                0   0   0
// Less Than              0   0   1
// Equal                  1   0   0
6227//
6228// From the above table the following equalities follow. As per ECMA spec *.UN opcodes perform
// unordered comparison of floating point values. That is, *.UN comparisons result in true when
// one of the operands is a NaN, whereas ordered comparisons result in false.
6231//
6232// Opcode Amd64 equivalent Comment
6233// ------ ----------------- --------
6234// BLT.UN(a,b) ucomis[s|d] a, b Jb branches if CF=1, which means either a<b or unordered from the above
6235// jb table
6236//
// BLT(a,b) ucomis[s|d] b, a Ja branches if CF=0 and ZF=0, which means b>a, which in turn implies a<b
6238// ja
6239//
6240// BGT.UN(a,b) ucomis[s|d] b, a branch if b<a or unordered ==> branch if a>b or unordered
6241// jb
6242//
6243// BGT(a, b) ucomis[s|d] a, b branch if a>b
6244// ja
6245//
6246// BLE.UN(a,b) ucomis[s|d] a, b jbe branches if CF=1 or ZF=1, which implies a<=b or unordered
6247// jbe
6248//
// BLE(a,b) ucomis[s|d] b, a jae branches if CF=0, which means b>=a, i.e. a<=b
6250// jae
6251//
6252// BGE.UN(a,b) ucomis[s|d] b, a branch if b<=a or unordered ==> branch if a>=b or unordered
6253// jbe
6254//
6255// BGE(a,b) ucomis[s|d] a, b branch if a>=b
6256// jae
6257//
6258// BEQ.UN(a,b) ucomis[s|d] a, b branch if a==b or unordered. There is no BEQ.UN opcode in ECMA spec.
// je This case is given for completeness, in case the JIT generates such
6260// a gentree internally.
6261//
6262// BEQ(a,b) ucomis[s|d] a, b From the above table, PF=0 and ZF=1 corresponds to a==b.
6263// jpe L1
6264// je <true label>
6265// L1:
6266//
6267// BNE(a,b) ucomis[s|d] a, b branch if a!=b. There is no BNE opcode in ECMA spec. This case is
// jne given for completeness, in case the JIT generates such a gentree
6269// internally.
6270//
6271// BNE.UN(a,b) ucomis[s|d] a, b From the above table, PF=1 or ZF=0 implies unordered or a!=b
6272// jpe <true label>
6273// jne <true label>
6274//
// As we can see from the above equalities, the operands of a compare operator need to be
// reversed in the case of BLT/CLT, BGT.UN/CGT.UN, BLE/CLE, BGE.UN/CGE.UN.
6277void CodeGen::genCompareFloat(GenTree* treeNode)
6278{
6279 assert(treeNode->OperIsCompare());
6280
6281 GenTreeOp* tree = treeNode->AsOp();
6282 GenTree* op1 = tree->gtOp1;
6283 GenTree* op2 = tree->gtOp2;
6284 var_types op1Type = op1->TypeGet();
6285 var_types op2Type = op2->TypeGet();
6286
6287 genConsumeOperands(tree);
6288
6289 assert(varTypeIsFloating(op1Type));
6290 assert(op1Type == op2Type);
6291
6292 regNumber targetReg = treeNode->gtRegNum;
6293 instruction ins;
6294 emitAttr cmpAttr;
6295
6296 bool reverseOps;
6297 if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
6298 {
6299 // Unordered comparison case
6300 reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE);
6301 }
6302 else
6303 {
6304 reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE);
6305 }
6306
6307 if (reverseOps)
6308 {
6309 GenTree* tmp = op1;
6310 op1 = op2;
6311 op2 = tmp;
6312 }
6313
6314 ins = ins_FloatCompare(op1Type);
6315 cmpAttr = emitTypeSize(op1Type);
6316
6317 getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2);
6318
6319 // Are we evaluating this into a register?
6320 if (targetReg != REG_NA)
6321 {
6322 genSetRegToCond(targetReg, tree);
6323 genProduceReg(tree);
6324 }
6325}
6326
6327//------------------------------------------------------------------------
6328// genCompareInt: Generate code for comparing ints or, on amd64, longs.
6329//
6330// Arguments:
6331// treeNode - the compare tree
6332//
6333// Return Value:
6334// None.
6335void CodeGen::genCompareInt(GenTree* treeNode)
6336{
6337 assert(treeNode->OperIsCompare() || treeNode->OperIs(GT_CMP));
6338
6339 GenTreeOp* tree = treeNode->AsOp();
6340 GenTree* op1 = tree->gtOp1;
6341 GenTree* op2 = tree->gtOp2;
6342 var_types op1Type = op1->TypeGet();
6343 var_types op2Type = op2->TypeGet();
6344 regNumber targetReg = tree->gtRegNum;
6345
6346 genConsumeOperands(tree);
6347
6348 assert(!op1->isContainedIntOrIImmed());
6349 assert(!varTypeIsFloating(op2Type));
6350
6351 instruction ins;
6352 var_types type = TYP_UNKNOWN;
6353
6354 if (tree->OperIs(GT_TEST_EQ, GT_TEST_NE))
6355 {
6356 ins = INS_test;
6357
// Unlike many xarch instructions, TEST doesn't have a form with a 16/32/64 bit first operand and
// an 8 bit immediate second operand. But if the immediate value fits in 8 bits then we can simply
// emit an 8 bit TEST instruction, unless we're targeting x86 and the first operand is a non-byteable
// register.
// Note that lowering does something similar but its main purpose is to allow memory operands to be
// contained, so it doesn't handle other kinds of operands. It could do more, but on x86 that results
// in additional register constraints and that may be worse than wasting 3 bytes on an immediate.
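// For example, a test against the constant 8 can be emitted as "test cl, 8" instead of "test ecx, 8",
// saving the 3 extra immediate bytes (CL is a byteable register).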
6365 if (
6366#ifdef _TARGET_X86_
6367 (!op1->isUsedFromReg() || isByteReg(op1->gtRegNum)) &&
6368#endif
6369 (op2->IsCnsIntOrI() && genSmallTypeCanRepresentValue(TYP_UBYTE, op2->AsIntCon()->IconValue())))
6370 {
6371 type = TYP_UBYTE;
6372 }
6373 }
6374 else if (op1->isUsedFromReg() && op2->IsIntegralConst(0))
6375 {
6376 // We're comparing a register to 0 so we can generate "test reg1, reg1"
6377 // instead of the longer "cmp reg1, 0"
6378 ins = INS_test;
6379 op2 = op1;
6380 }
6381 else
6382 {
6383 ins = INS_cmp;
6384 }
6385
6386 if (type == TYP_UNKNOWN)
6387 {
6388 if (op1Type == op2Type)
6389 {
6390 type = op1Type;
6391 }
6392 else if (genTypeSize(op1Type) == genTypeSize(op2Type))
6393 {
6394 // If the types are different but have the same size then we'll use TYP_INT or TYP_LONG.
6395 // This primarily deals with small type mixes (e.g. byte/ubyte) that need to be widened
6396 // and compared as int. We should not get long type mixes here but handle that as well
6397 // just in case.
6398 type = genTypeSize(op1Type) == 8 ? TYP_LONG : TYP_INT;
6399 }
6400 else
6401 {
// If the types are different, simply use TYP_INT. This deals with small type/int type
// mixes (e.g. byte/short, ubyte/int) that need to be widened and compared as int.
6404 // Lowering is expected to handle any mixes that involve long types (e.g. int/long).
6405 type = TYP_INT;
6406 }
6407
6408 // The common type cannot be smaller than any of the operand types, we're probably mixing int/long
6409 assert(genTypeSize(type) >= max(genTypeSize(op1Type), genTypeSize(op2Type)));
6410 // Small unsigned int types (TYP_BOOL can use anything) should use unsigned comparisons
6411 assert(!(varTypeIsSmallInt(type) && varTypeIsUnsigned(type)) || ((tree->gtFlags & GTF_UNSIGNED) != 0));
6412 // If op1 is smaller then it cannot be in memory, we're probably missing a cast
6413 assert((genTypeSize(op1Type) >= genTypeSize(type)) || !op1->isUsedFromMemory());
6414 // If op2 is smaller then it cannot be in memory, we're probably missing a cast
6415 assert((genTypeSize(op2Type) >= genTypeSize(type)) || !op2->isUsedFromMemory());
6416 // If we ended up with a small type and op2 is a constant then make sure we don't lose constant bits
6417 assert(!op2->IsCnsIntOrI() || !varTypeIsSmall(type) ||
6418 genSmallTypeCanRepresentValue(type, op2->AsIntCon()->IconValue()));
6419 }
6420
6421 // The type cannot be larger than the machine word size
6422 assert(genTypeSize(type) <= genTypeSize(TYP_I_IMPL));
6423 // TYP_UINT and TYP_ULONG should not appear here, only small types can be unsigned
6424 assert(!varTypeIsUnsigned(type) || varTypeIsSmall(type));
6425
6426 getEmitter()->emitInsBinary(ins, emitTypeSize(type), op1, op2);
6427
6428 // Are we evaluating this into a register?
6429 if (targetReg != REG_NA)
6430 {
6431 genSetRegToCond(targetReg, tree);
6432 genProduceReg(tree);
6433 }
6434}
6435
6436//-------------------------------------------------------------------------------------------
6437// genSetRegToCond: Set a register 'dstReg' to the appropriate one or zero value
6438// corresponding to a binary Relational operator result.
6439//
6440// Arguments:
6441// dstReg - The target register to set to 1 or 0
6442// tree - The GenTree Relop node that was used to set the Condition codes
6443//
6444// Return Value: none
6445//
6446// Notes:
// A full 64-bit value of either 1 or 0 is set up in the 'dstReg'
6448//-------------------------------------------------------------------------------------------
6449
6450void CodeGen::genSetRegToCond(regNumber dstReg, GenTree* tree)
6451{
6452 noway_assert((genRegMask(dstReg) & RBM_BYTE_REGS) != 0);
6453
6454 emitJumpKind jumpKind[2];
6455 bool branchToTrueLabel[2];
6456 genJumpKindsForTree(tree, jumpKind, branchToTrueLabel);
6457
6458 if (jumpKind[1] == EJ_NONE)
6459 {
6460 // Set (lower byte of) reg according to the flags
6461 inst_SET(jumpKind[0], dstReg);
6462 }
6463 else
6464 {
6465#ifdef DEBUG
// jmpKind[1] != EJ_NONE implies BEQ and BNE.UN of floating point values.
6467 // These are represented by two conditions.
6468 if (tree->gtOper == GT_EQ)
6469 {
6470 // This must be an ordered comparison.
6471 assert((tree->gtFlags & GTF_RELOP_NAN_UN) == 0);
6472 }
6473 else
6474 {
6475 // This must be BNE.UN
6476 assert((tree->gtOper == GT_NE) && ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0));
6477 }
6478#endif
6479
6480 // Here is the sample code generated in each case:
6481 // BEQ == cmp, jpe <false label>, je <true label>
// That is, to materialize the comparison, reg needs to be set if PF=0 and ZF=1
6483 // setnp reg // if (PF==0) reg = 1 else reg = 0
6484 // jpe L1 // Jmp if PF==1
6485 // sete reg
6486 // L1:
6487 //
6488 // BNE.UN == cmp, jpe <true label>, jne <true label>
// That is, to materialize the comparison, reg needs to be set if either PF=1 or ZF=0;
6490 // setp reg
6491 // jpe L1
6492 // setne reg
6493 // L1:
6494
6495 // reverse the jmpkind condition before setting dstReg if it is to false label.
6496 inst_SET(branchToTrueLabel[0] ? jumpKind[0] : emitter::emitReverseJumpKind(jumpKind[0]), dstReg);
6497
6498 BasicBlock* label = genCreateTempLabel();
6499 inst_JMP(jumpKind[0], label);
6500
6501 // second branch is always to true label
6502 assert(branchToTrueLabel[1]);
6503 inst_SET(jumpKind[1], dstReg);
6504 genDefineTempLabel(label);
6505 }
6506
6507 var_types treeType = tree->TypeGet();
6508 if (treeType == TYP_INT || treeType == TYP_LONG)
6509 {
6510 // Set the higher bytes to 0
6511 inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
6512 }
6513 else
6514 {
6515 noway_assert(treeType == TYP_BYTE);
6516 }
6517}
6518
6519#if !defined(_TARGET_64BIT_)
6520//------------------------------------------------------------------------
6521// genLongToIntCast: Generate code for long to int casts on x86.
6522//
6523// Arguments:
6524// cast - The GT_CAST node
6525//
6526// Return Value:
6527// None.
6528//
6529// Assumptions:
6530// The cast node and its sources (via GT_LONG) must have been assigned registers.
6531// The destination cannot be a floating point type or a small integer type.
6532//
6533void CodeGen::genLongToIntCast(GenTree* cast)
6534{
6535 assert(cast->OperGet() == GT_CAST);
6536
6537 GenTree* src = cast->gtGetOp1();
6538 noway_assert(src->OperGet() == GT_LONG);
6539
6540 genConsumeRegs(src);
6541
6542 var_types srcType = ((cast->gtFlags & GTF_UNSIGNED) != 0) ? TYP_ULONG : TYP_LONG;
6543 var_types dstType = cast->CastToType();
6544 regNumber loSrcReg = src->gtGetOp1()->gtRegNum;
6545 regNumber hiSrcReg = src->gtGetOp2()->gtRegNum;
6546 regNumber dstReg = cast->gtRegNum;
6547
6548 assert((dstType == TYP_INT) || (dstType == TYP_UINT));
6549 assert(genIsValidIntReg(loSrcReg));
6550 assert(genIsValidIntReg(hiSrcReg));
6551 assert(genIsValidIntReg(dstReg));
6552
6553 if (cast->gtOverflow())
6554 {
6555 //
6556 // Generate an overflow check for [u]long to [u]int casts:
6557 //
6558 // long -> int - check if the upper 33 bits are all 0 or all 1
6559 //
6560 // ulong -> int - check if the upper 33 bits are all 0
6561 //
6562 // long -> uint - check if the upper 32 bits are all 0
6563 // ulong -> uint - check if the upper 32 bits are all 0
6564 //
6565
6566 if ((srcType == TYP_LONG) && (dstType == TYP_INT))
6567 {
6568 BasicBlock* allOne = genCreateTempLabel();
6569 BasicBlock* success = genCreateTempLabel();
6570
6571 inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE);
6572 inst_JMP(EJ_js, allOne);
6573
6574 inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE);
6575 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6576 inst_JMP(EJ_jmp, success);
6577
6578 genDefineTempLabel(allOne);
6579 inst_RV_IV(INS_cmp, hiSrcReg, -1, EA_4BYTE);
6580 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6581
6582 genDefineTempLabel(success);
6583 }
6584 else
6585 {
6586 if ((srcType == TYP_ULONG) && (dstType == TYP_INT))
6587 {
6588 inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE);
6589 genJumpToThrowHlpBlk(EJ_js, SCK_OVERFLOW);
6590 }
6591
6592 inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE);
6593 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6594 }
6595 }
6596
6597 if (dstReg != loSrcReg)
6598 {
6599 inst_RV_RV(INS_mov, dstReg, loSrcReg, TYP_INT, EA_4BYTE);
6600 }
6601
6602 genProduceReg(cast);
6603}
6604#endif
6605
6606//------------------------------------------------------------------------
6607// genIntCastOverflowCheck: Generate overflow checking code for an integer cast.
6608//
6609// Arguments:
6610// cast - The GT_CAST node
6611// desc - The cast description
6612// reg - The register containing the value to check
6613//
6614void CodeGen::genIntCastOverflowCheck(GenTreeCast* cast, const GenIntCastDesc& desc, regNumber reg)
6615{
6616 switch (desc.CheckKind())
6617 {
6618 case GenIntCastDesc::CHECK_POSITIVE:
6619 getEmitter()->emitIns_R_R(INS_test, EA_SIZE(desc.CheckSrcSize()), reg, reg);
6620 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6621 break;
6622
6623#ifdef _TARGET_64BIT_
6624 case GenIntCastDesc::CHECK_UINT_RANGE:
6625 {
6626 // We need to check if the value is not greater than 0xFFFFFFFF but this value
6627 // cannot be encoded in an immediate operand. Use a right shift to test if the
6628 // upper 32 bits are zero. This requires a temporary register.
6629 const regNumber tempReg = cast->GetSingleTempReg();
6630 assert(tempReg != reg);
6631 getEmitter()->emitIns_R_R(INS_mov, EA_8BYTE, tempReg, reg);
6632 getEmitter()->emitIns_R_I(INS_shr_N, EA_8BYTE, tempReg, 32);
6633 genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
6634 }
6635 break;
6636
6637 case GenIntCastDesc::CHECK_POSITIVE_INT_RANGE:
6638 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX);
6639 genJumpToThrowHlpBlk(EJ_ja, SCK_OVERFLOW);
6640 break;
6641
6642 case GenIntCastDesc::CHECK_INT_RANGE:
6643 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX);
6644 genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW);
6645 getEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MIN);
6646 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6647 break;
6648#endif
6649
6650 default:
6651 {
6652 assert(desc.CheckKind() == GenIntCastDesc::CHECK_SMALL_INT_RANGE);
6653 const int castMaxValue = desc.CheckSmallIntMax();
6654 const int castMinValue = desc.CheckSmallIntMin();
6655
6656 getEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMaxValue);
6657 genJumpToThrowHlpBlk((castMinValue == 0) ? EJ_ja : EJ_jg, SCK_OVERFLOW);
6658
6659 if (castMinValue != 0)
6660 {
6661 getEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMinValue);
6662 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
6663 }
6664 }
6665 break;
6666 }
6667}
6668
6669//------------------------------------------------------------------------
6670// genIntToIntCast: Generate code for an integer cast, with or without overflow check.
6671//
6672// Arguments:
6673// cast - The GT_CAST node
6674//
6675// Assumptions:
6676// The cast node is not a contained node and must have an assigned register.
6677// Neither the source nor target type can be a floating point type.
6678// On x86 casts to (U)BYTE require that the source be in a byte register.
6679//
6680// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register.
6681//
6682void CodeGen::genIntToIntCast(GenTreeCast* cast)
6683{
6684 genConsumeRegs(cast->gtGetOp1());
6685
6686 const regNumber srcReg = cast->gtGetOp1()->gtRegNum;
6687 const regNumber dstReg = cast->gtRegNum;
6688
6689 assert(genIsValidIntReg(srcReg));
6690 assert(genIsValidIntReg(dstReg));
6691
6692 GenIntCastDesc desc(cast);
6693
6694 if (desc.CheckKind() != GenIntCastDesc::CHECK_NONE)
6695 {
6696 genIntCastOverflowCheck(cast, desc, srcReg);
6697 }
6698
6699 if ((desc.ExtendKind() != GenIntCastDesc::COPY) || (srcReg != dstReg))
6700 {
6701 instruction ins;
6702 unsigned insSize;
6703
6704 switch (desc.ExtendKind())
6705 {
6706 case GenIntCastDesc::ZERO_EXTEND_SMALL_INT:
6707 ins = INS_movzx;
6708 insSize = desc.ExtendSrcSize();
6709 break;
6710 case GenIntCastDesc::SIGN_EXTEND_SMALL_INT:
6711 ins = INS_movsx;
6712 insSize = desc.ExtendSrcSize();
6713 break;
6714#ifdef _TARGET_64BIT_
6715 case GenIntCastDesc::ZERO_EXTEND_INT:
6716 ins = INS_mov;
6717 insSize = 4;
6718 break;
6719 case GenIntCastDesc::SIGN_EXTEND_INT:
6720 ins = INS_movsxd;
6721 insSize = 4;
6722 break;
6723#endif
6724 default:
6725 assert(desc.ExtendKind() == GenIntCastDesc::COPY);
6726 ins = INS_mov;
6727 insSize = desc.ExtendSrcSize();
6728 break;
6729 }
6730
6731 getEmitter()->emitIns_R_R(ins, EA_ATTR(insSize), dstReg, srcReg);
6732 }
6733
6734 genProduceReg(cast);
6735}
6736
6737//------------------------------------------------------------------------
6738// genFloatToFloatCast: Generate code for a cast between float and double
6739//
6740// Arguments:
6741// treeNode - The GT_CAST node
6742//
6743// Return Value:
6744// None.
6745//
6746// Assumptions:
6747// Cast is a non-overflow conversion.
6748// The treeNode must have an assigned register.
6749// The cast is between float and double or vice versa.
6750//
6751void CodeGen::genFloatToFloatCast(GenTree* treeNode)
6752{
6753 // float <--> double conversions are always non-overflow ones
6754 assert(treeNode->OperGet() == GT_CAST);
6755 assert(!treeNode->gtOverflow());
6756
6757 regNumber targetReg = treeNode->gtRegNum;
6758 assert(genIsValidFloatReg(targetReg));
6759
6760 GenTree* op1 = treeNode->gtOp.gtOp1;
6761#ifdef DEBUG
6762 // If not contained, must be a valid float reg.
6763 if (op1->isUsedFromReg())
6764 {
6765 assert(genIsValidFloatReg(op1->gtRegNum));
6766 }
6767#endif
6768
6769 var_types dstType = treeNode->CastToType();
6770 var_types srcType = op1->TypeGet();
6771 assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
6772
6773 genConsumeOperands(treeNode->AsOp());
6774 if (srcType == dstType && (op1->isUsedFromReg() && (targetReg == op1->gtRegNum)))
6775 {
        // The source and destination types are the same and the value is already in the
        // target register, so there is nothing to do beyond consuming and producing the reg.
6778 ;
6779 }
6780 else
6781 {
6782 instruction ins = ins_FloatConv(dstType, srcType);
6783 getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
6784 }
6785
6786 genProduceReg(treeNode);
6787}
6788
6789//------------------------------------------------------------------------
6790// genIntToFloatCast: Generate code to cast an int/long to float/double
6791//
6792// Arguments:
6793// treeNode - The GT_CAST node
6794//
6795// Return Value:
6796// None.
6797//
6798// Assumptions:
6799// Cast is a non-overflow conversion.
6800// The treeNode must have an assigned register.
6801// SrcType= int32/uint32/int64/uint64 and DstType=float/double.
6802//
6803void CodeGen::genIntToFloatCast(GenTree* treeNode)
6804{
6805 // int type --> float/double conversions are always non-overflow ones
6806 assert(treeNode->OperGet() == GT_CAST);
6807 assert(!treeNode->gtOverflow());
6808
6809 regNumber targetReg = treeNode->gtRegNum;
6810 assert(genIsValidFloatReg(targetReg));
6811
6812 GenTree* op1 = treeNode->gtOp.gtOp1;
6813#ifdef DEBUG
6814 if (op1->isUsedFromReg())
6815 {
6816 assert(genIsValidIntReg(op1->gtRegNum));
6817 }
6818#endif
6819
6820 var_types dstType = treeNode->CastToType();
6821 var_types srcType = op1->TypeGet();
6822 assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
6823
6824#if !defined(_TARGET_64BIT_)
6825 // We expect morph to replace long to float/double casts with helper calls
6826 noway_assert(!varTypeIsLong(srcType));
6827#endif // !defined(_TARGET_64BIT_)
6828
    // Since the xarch emitter doesn't report GC info correctly when casting away GC-ness, we
    // ensure that the srcType of a cast is a non-GC type. Codegen should never see TYP_BYREF as
    // a source type except for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR, which represent stack
    // addresses and can be treated as TYP_I_IMPL. In all other cases, where the source operand
    // is a GC type and not known to be on the stack, the front-end (see fgMorphCast()) ensures
    // this by assigning the GC-typed local to a non-GC-typed temp and using that temp as the
    // operand of the cast.
6835 if (srcType == TYP_BYREF)
6836 {
6837 noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR);
6838 srcType = TYP_I_IMPL;
6839 }
6840
6841 // force the srcType to unsigned if GT_UNSIGNED flag is set
6842 if (treeNode->gtFlags & GTF_UNSIGNED)
6843 {
6844 srcType = genUnsignedType(srcType);
6845 }
6846
6847 noway_assert(!varTypeIsGC(srcType));
6848
    // We should never see a srcType whose size is neither sizeof(int) nor sizeof(long).
    // For conversions from byte/sbyte/int16/uint16 to float/double, we expect either the
    // front-end or the lowering phase to have generated two levels of cast: the first widens
    // the smaller int type to int32, and the second converts int32 to float/double.
6854 emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
6855 noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
6856
    // Also, we don't expect to see uint32 -> float/double or uint64 -> float conversions
    // here since they should have been lowered appropriately.
6859 noway_assert(srcType != TYP_UINT);
6860 noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT));
6861
    // To convert an int to a float/double, the cvtsi2ss/sd SSE2 instruction is used, which does
    // a partial write to the lower 4/8 bytes of the xmm register, keeping the upper bytes
    // unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, the partial write can
    // introduce a false dependency and cause a stall if there are further uses of xmmReg. We hit
    // such a case with a customer-reported version of the SpectralNorm benchmark, which showed a
    // 2x perf regression. To avoid the false dependency, we emit "xorps xmmReg, xmmReg" before
    // the cvtsi2ss/sd instruction.
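    //
    // For example (illustrative; the registers are only for exposition), converting a long in
    // rcx to a double in xmm0 emits:
    //   xorps    xmm0, xmm0        ; break the false dependency on xmm0's previous contents
    //   cvtsi2sd xmm0, rcx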
6870
6871 genConsumeOperands(treeNode->AsOp());
6872 getEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->gtRegNum, treeNode->gtRegNum);
6873
    // Note that here we need to specify srcType, which determines the size of the
    // source reg/mem operand and the rex.w prefix.
6876 instruction ins = ins_FloatConv(dstType, TYP_INT);
6877 getEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
6878
6879 // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
6880 // will interpret ULONG value as LONG. Hence we need to adjust the
6881 // result if sign-bit of srcType is set.
6882 if (srcType == TYP_ULONG)
6883 {
        // The instruction sequence below is less accurate than what clang and gcc generate.
        // However, we keep the current sequence for backward compatibility. If we change the
        // instructions below, FloatingPointUtils::convertUInt64ToDobule should also be updated
        // so that the conversion results remain consistent.
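        //
        // The 0x43f0000000000000 constant below is 2^64 encoded as a double, so, as an
        // illustrative example, an input of 0xFFFFFFFFFFFFFFFF is first converted as -1.0 and
        // then adjusted to -1.0 + 2^64, i.e. roughly 1.8446744073709552e19, the expected
        // unsigned value.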
6888 assert(dstType == TYP_DOUBLE);
6889 assert(op1->isUsedFromReg());
6890
6891 // Set the flags without modifying op1.
6892 // test op1Reg, op1Reg
6893 inst_RV_RV(INS_test, op1->gtRegNum, op1->gtRegNum, srcType);
6894
6895 // No need to adjust result if op1 >= 0 i.e. positive
6896 // Jge label
6897 BasicBlock* label = genCreateTempLabel();
6898 inst_JMP(EJ_jge, label);
6899
6900 // Adjust the result
6901 // result = result + 0x43f00000 00000000
6902 // addsd resultReg, 0x43f00000 00000000
6903 CORINFO_FIELD_HANDLE* cns = &u8ToDblBitmask;
6904 if (*cns == nullptr)
6905 {
6906 double d;
6907 static_assert_no_msg(sizeof(double) == sizeof(__int64));
6908 *((__int64*)&d) = 0x43f0000000000000LL;
6909
6910 *cns = getEmitter()->emitFltOrDblConst(d, EA_8BYTE);
6911 }
6912 getEmitter()->emitIns_R_C(INS_addsd, EA_8BYTE, treeNode->gtRegNum, *cns, 0);
6913
6914 genDefineTempLabel(label);
6915 }
6916
6917 genProduceReg(treeNode);
6918}
6919
6920//------------------------------------------------------------------------
6921// genFloatToIntCast: Generate code to cast float/double to int/long
6922//
6923// Arguments:
6924// treeNode - The GT_CAST node
6925//
6926// Return Value:
6927// None.
6928//
6929// Assumptions:
6930// Cast is a non-overflow conversion.
6931// The treeNode must have an assigned register.
6932// SrcType=float/double and DstType= int32/uint32/int64/uint64
6933//
6934// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64
6935//
6936void CodeGen::genFloatToIntCast(GenTree* treeNode)
6937{
    // We don't expect to see overflow-detecting float/double --> int type conversions here,
    // as they should have been converted into helper calls by the front-end.
6940 assert(treeNode->OperGet() == GT_CAST);
6941 assert(!treeNode->gtOverflow());
6942
6943 regNumber targetReg = treeNode->gtRegNum;
6944 assert(genIsValidIntReg(targetReg));
6945
6946 GenTree* op1 = treeNode->gtOp.gtOp1;
6947#ifdef DEBUG
6948 if (op1->isUsedFromReg())
6949 {
6950 assert(genIsValidFloatReg(op1->gtRegNum));
6951 }
6952#endif
6953
6954 var_types dstType = treeNode->CastToType();
6955 var_types srcType = op1->TypeGet();
6956 assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
6957
6958 // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG).
6959 // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the
6960 // front-end or lowering phase to have generated two levels of cast. The first one is
6961 // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to
6962 // the required smaller int type.
6963 emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
6964 noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
6965
6966 // We shouldn't be seeing uint64 here as it should have been converted
6967 // into a helper call by either front-end or lowering phase.
6968 noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))));
6969
    // If the dstType is TYP_UINT, we need all 32 bits of the result to encode the value, so the
    // sign bit must live in bit 33 or above. To achieve this we pretend we are converting to a
    // long, which makes the conversion use the full 64-bit destination register.
6973 if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))))
6974 {
6975 dstType = TYP_LONG;
6976 }
6977
    // Note that we need to specify dstType here, since it determines the size of the
    // destination integer register and the rex.w prefix.
6980 genConsumeOperands(treeNode->AsOp());
6981 instruction ins = ins_FloatConv(TYP_INT, srcType);
6982 getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
6983 genProduceReg(treeNode);
6984}
6985
6986//------------------------------------------------------------------------
6987// genCkfinite: Generate code for ckfinite opcode.
6988//
6989// Arguments:
6990// treeNode - The GT_CKFINITE node
6991//
6992// Return Value:
6993// None.
6994//
6995// Assumptions:
6996// GT_CKFINITE node has reserved an internal register.
6997//
6998// TODO-XArch-CQ - mark the operand as contained if known to be in
6999// memory (e.g. field or an array element).
7000//
7001void CodeGen::genCkfinite(GenTree* treeNode)
7002{
7003 assert(treeNode->OperGet() == GT_CKFINITE);
7004
7005 GenTree* op1 = treeNode->gtOp.gtOp1;
7006 var_types targetType = treeNode->TypeGet();
7007 int expMask = (targetType == TYP_FLOAT) ? 0x7F800000 : 0x7FF00000; // Bit mask to extract exponent.
7008 regNumber targetReg = treeNode->gtRegNum;
7009
7010 // Extract exponent into a register.
7011 regNumber tmpReg = treeNode->GetSingleTempReg();
7012
7013 genConsumeReg(op1);
7014
7015#ifdef _TARGET_64BIT_
7016
7017 // Copy the floating-point value to an integer register. If we copied a float to a long, then
7018 // right-shift the value so the high 32 bits of the floating-point value sit in the low 32
7019 // bits of the integer register.
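    //
    // For TYP_DOUBLE this emits, roughly (illustrative):
    //   mov_xmm2i tmpReg, op1Reg    ; copy the raw bits to the integer temp
    //   shr       tmpReg, 32        ; bring the exponent into the low 32 bits
    //   and       tmpReg, 0x7FF00000
    //   cmp       tmpReg, 0x7FF00000
    //   je        <throw block>     ; an all-ones exponent means NaN or infinity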
7020 instruction ins = ins_CopyFloatToInt(targetType, (targetType == TYP_FLOAT) ? TYP_INT : TYP_LONG);
7021 inst_RV_RV(ins, op1->gtRegNum, tmpReg, targetType);
7022 if (targetType == TYP_DOUBLE)
7023 {
7024 // right shift by 32 bits to get to exponent.
7025 inst_RV_SH(INS_shr, EA_8BYTE, tmpReg, 32);
7026 }
7027
7028 // Mask exponent with all 1's and check if the exponent is all 1's
7029 inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
7030 inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
7031
7032 // If exponent is all 1's, throw ArithmeticException
7033 genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
7034
7035 // if it is a finite value copy it to targetReg
7036 if (targetReg != op1->gtRegNum)
7037 {
7038 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7039 }
7040
7041#else // !_TARGET_64BIT_
7042
    // If the target type is TYP_DOUBLE, we want to extract the high 32 bits into the register.
    // There is no easy way to do this. To avoid requiring an extra register, we use shuffles
    // to move the high 32 bits into the low 32 bits, then shuffle them back, since we need to
    // produce the value in the target register.
7047 //
7048 // For TYP_DOUBLE, we'll generate (for targetReg != op1->gtRegNum):
7049 // movaps targetReg, op1->gtRegNum
7050 // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
7051 // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
7052 // and tmpReg, <mask>
7053 // cmp tmpReg, <mask>
7054 // je <throw block>
7055 // movaps targetReg, op1->gtRegNum // copy the value again, instead of un-shuffling it
7056 //
7057 // For TYP_DOUBLE with (targetReg == op1->gtRegNum):
7058 // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
7059 // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
7060 // and tmpReg, <mask>
7061 // cmp tmpReg, <mask>
7062 // je <throw block>
7063 // shufps targetReg, targetReg, 0xB1 // ZWXY => WZYX
7064 //
7065 // For TYP_FLOAT, it's the same as _TARGET_64BIT_:
7066 // mov_xmm2i tmpReg, targetReg // tmpReg <= low 32 bits
7067 // and tmpReg, <mask>
7068 // cmp tmpReg, <mask>
7069 // je <throw block>
7070 // movaps targetReg, op1->gtRegNum // only if targetReg != op1->gtRegNum
7071
7072 regNumber copyToTmpSrcReg; // The register we'll copy to the integer temp.
7073
7074 if (targetType == TYP_DOUBLE)
7075 {
7076 if (targetReg != op1->gtRegNum)
7077 {
7078 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7079 }
7080 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
7081 copyToTmpSrcReg = targetReg;
7082 }
7083 else
7084 {
7085 copyToTmpSrcReg = op1->gtRegNum;
7086 }
7087
7088 // Copy only the low 32 bits. This will be the high order 32 bits of the floating-point
7089 // value, no matter the floating-point type.
7090 inst_RV_RV(ins_CopyFloatToInt(TYP_FLOAT, TYP_INT), copyToTmpSrcReg, tmpReg, TYP_FLOAT);
7091
7092 // Mask exponent with all 1's and check if the exponent is all 1's
7093 inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
7094 inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
7095
7096 // If exponent is all 1's, throw ArithmeticException
7097 genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
7098
7099 if (targetReg != op1->gtRegNum)
7100 {
7101 // In both the TYP_FLOAT and TYP_DOUBLE case, the op1 register is untouched,
7102 // so copy it to the targetReg. This is faster and smaller for TYP_DOUBLE
7103 // than re-shuffling the targetReg.
7104 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
7105 }
7106 else if (targetType == TYP_DOUBLE)
7107 {
7108 // We need to re-shuffle the targetReg to get the correct result.
7109 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
7110 }
7111
7112#endif // !_TARGET_64BIT_
7113
7114 genProduceReg(treeNode);
7115}
7116
7117#ifdef _TARGET_AMD64_
7118int CodeGenInterface::genSPtoFPdelta()
7119{
7120 int delta;
7121
7122#ifdef UNIX_AMD64_ABI
7123
7124 // We require frame chaining on Unix to support native tool unwinding (such as
7125 // unwinding by the native debugger). We have a CLR-only extension to the
7126 // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240.
7127 // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated.
7128 delta = genTotalFrameSize();
7129
7130#else // !UNIX_AMD64_ABI
7131
    // As per the AMD64 ABI, the RBP offset from the initial RSP can be between 0 and 240 if
    // RBP needs to be reported in the unwind codes. This case arises for methods with localloc.
7135 if (compiler->compLocallocUsed)
7136 {
        // We cannot base the delta computation on compLclFrameSize since it changes between the
        // tentative and final frame layouts, which creates the possibility of under-estimating
        // the offsets of variables from FP and, in turn, under-estimating instruction sizes.
        //
        // To be predictable, and to never under-estimate the offset of a variable from FP, we
        // always position FP at min(240, outgoing arg area size).
7144 delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize);
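
        // For example: with lvaOutgoingArgSpaceSize == 0x20 the frame pointer is positioned
        // 0x20 bytes above SP, while any outgoing arg area of 240 bytes or more pins it at the
        // 240-byte maximum representable by the unwind codes.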
7145 }
7146 else if (compiler->opts.compDbgEnC)
7147 {
        // The VM assumes that RSP and RBP are equal in EnC methods.
7149 delta = 0;
7150 }
7151 else
7152 {
7153 delta = genTotalFrameSize();
7154 }
7155
7156#endif // !UNIX_AMD64_ABI
7157
7158 return delta;
7159}
7160
7161//---------------------------------------------------------------------
7162// genTotalFrameSize - return the total size of the stack frame, including local size,
7163// callee-saved register size, etc. For AMD64, this does not include the caller-pushed
7164// return address.
7165//
7166// Return value:
7167// Total frame size
7168//
7169
7170int CodeGenInterface::genTotalFrameSize()
7171{
7172 assert(!IsUninitialized(compiler->compCalleeRegsPushed));
7173
7174 int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize;
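
    // For example (illustrative): 4 callee-saved registers and a compLclFrameSize of 0x28 give
    // a total frame size of 4 * 8 + 0x28 = 0x48 bytes on AMD64.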
7175
7176 assert(totalFrameSize >= 0);
7177 return totalFrameSize;
7178}
7179
7180//---------------------------------------------------------------------
7181// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer.
7182// This number is going to be negative, since the Caller-SP is at a higher
7183// address than the frame pointer.
7184//
7185// There must be a frame pointer to call this function!
7186//
7187// We can't compute this directly from the Caller-SP, since the frame pointer
7188// is based on a maximum delta from Initial-SP, so first we find SP, then
7189// compute the FP offset.
7190
7191int CodeGenInterface::genCallerSPtoFPdelta()
7192{
7193 assert(isFramePointerUsed());
7194 int callerSPtoFPdelta;
7195
7196 callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta();
7197
7198 assert(callerSPtoFPdelta <= 0);
7199 return callerSPtoFPdelta;
7200}
7201
7202//---------------------------------------------------------------------
7203// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP.
7204//
7205// This number will be negative.
7206
7207int CodeGenInterface::genCallerSPtoInitialSPdelta()
7208{
7209 int callerSPtoSPdelta = 0;
7210
7211 callerSPtoSPdelta -= genTotalFrameSize();
7212 callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address
7213
7214 // compCalleeRegsPushed does not account for the frame pointer
7215 // TODO-Cleanup: shouldn't this be part of genTotalFrameSize?
7216 if (isFramePointerUsed())
7217 {
7218 callerSPtoSPdelta -= REGSIZE_BYTES;
7219 }
7220
7221 assert(callerSPtoSPdelta <= 0);
7222 return callerSPtoSPdelta;
7223}
7224#endif // _TARGET_AMD64_
7225
7226//-----------------------------------------------------------------------------------------
7227// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask"
7228//
7229// Arguments:
7230// treeNode - tree node
7231//
7232// Return value:
7233// None
7234//
7235// Assumptions:
7236// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs()
7237// ii) tree type is floating point type.
7238// iii) caller of this routine needs to call genProduceReg()
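//
// A sketch of the emitted code (illustrative registers; xmm1 is the reserved internal temp):
//   GT_NEG on a double held in xmm0:
//     movsd xmm1, qword ptr [negBitmaskDbl]   ; the 0x8000000000000000 sign-bit mask
//     xorps xmm0, xmm1
//   Abs() swaps the mask and opcode: andps with the 0x7fffffffffffffff mask instead.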
7239void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
7240{
7241 regNumber targetReg = treeNode->gtRegNum;
7242 var_types targetType = treeNode->TypeGet();
7243 assert(varTypeIsFloating(targetType));
7244
7245 float f;
7246 double d;
7247 CORINFO_FIELD_HANDLE* bitMask = nullptr;
7248 instruction ins = INS_invalid;
7249 void* cnsAddr = nullptr;
7250 bool dblAlign = false;
7251
7252 switch (treeNode->OperGet())
7253 {
7254 case GT_NEG:
7255 // Neg(x) = flip the sign bit.
7256 // Neg(f) = f ^ 0x80000000
7257 // Neg(d) = d ^ 0x8000000000000000
7258 ins = INS_xorps;
7259 if (targetType == TYP_FLOAT)
7260 {
7261 bitMask = &negBitmaskFlt;
7262
7263 static_assert_no_msg(sizeof(float) == sizeof(int));
7264 *((int*)&f) = 0x80000000;
7265 cnsAddr = &f;
7266 }
7267 else
7268 {
7269 bitMask = &negBitmaskDbl;
7270
7271 static_assert_no_msg(sizeof(double) == sizeof(__int64));
7272 *((__int64*)&d) = 0x8000000000000000LL;
7273 cnsAddr = &d;
7274 dblAlign = true;
7275 }
7276 break;
7277
7278 case GT_INTRINSIC:
7279 assert(treeNode->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs);
7280
7281 // Abs(x) = set sign-bit to zero
7282 // Abs(f) = f & 0x7fffffff
7283 // Abs(d) = d & 0x7fffffffffffffff
7284 ins = INS_andps;
7285 if (targetType == TYP_FLOAT)
7286 {
7287 bitMask = &absBitmaskFlt;
7288
7289 static_assert_no_msg(sizeof(float) == sizeof(int));
7290 *((int*)&f) = 0x7fffffff;
7291 cnsAddr = &f;
7292 }
7293 else
7294 {
7295 bitMask = &absBitmaskDbl;
7296
7297 static_assert_no_msg(sizeof(double) == sizeof(__int64));
7298 *((__int64*)&d) = 0x7fffffffffffffffLL;
7299 cnsAddr = &d;
7300 dblAlign = true;
7301 }
7302 break;
7303
7304 default:
7305 assert(!"genSSE2: unsupported oper");
7306 unreached();
7307 break;
7308 }
7309
7310 if (*bitMask == nullptr)
7311 {
7312 assert(cnsAddr != nullptr);
7313 *bitMask = getEmitter()->emitAnyConst(cnsAddr, genTypeSize(targetType), dblAlign);
7314 }
7315
7316 // We need an additional register for bitmask.
7317 regNumber tmpReg = treeNode->GetSingleTempReg();
7318
7319 // Move operand into targetReg only if the reg reserved for
7320 // internal purpose is not the same as targetReg.
7321 GenTree* op1 = treeNode->gtOp.gtOp1;
7322 assert(op1->isUsedFromReg());
7323 regNumber operandReg = genConsumeReg(op1);
7324 if (tmpReg != targetReg)
7325 {
7326 if (operandReg != targetReg)
7327 {
7328 inst_RV_RV(ins_Copy(targetType), targetReg, operandReg, targetType);
7329 }
7330
7331 operandReg = tmpReg;
7332 }
7333
7334 getEmitter()->emitIns_R_C(ins_Load(targetType, false), emitTypeSize(targetType), tmpReg, *bitMask, 0);
7335 assert(ins != INS_invalid);
7336 inst_RV_RV(ins, targetReg, operandReg, targetType);
7337}
7338
7339//-----------------------------------------------------------------------------------------
7340// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation
7341//
7342// Arguments:
7343// treeNode - tree node
7344//
7345// Return value:
7346// None
7347//
7348// Assumptions:
7349// i) SSE4.1 is supported by the underlying hardware
7350// ii) treeNode oper is a GT_INTRINSIC
7351// iii) treeNode type is a floating point type
7352// iv) treeNode is not used from memory
7353// v) tree oper is CORINFO_INTRINSIC_Round, _Ceiling, or _Floor
7354// vi) caller of this routine needs to call genProduceReg()
7355void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode)
7356{
7357 // i) SSE4.1 is supported by the underlying hardware
7358 assert(compiler->compSupports(InstructionSet_SSE41));
7359
7360 // ii) treeNode oper is a GT_INTRINSIC
7361 assert(treeNode->OperGet() == GT_INTRINSIC);
7362
7363 GenTree* srcNode = treeNode->gtGetOp1();
7364
7365 // iii) treeNode type is floating point type
7366 assert(varTypeIsFloating(srcNode));
7367 assert(srcNode->TypeGet() == treeNode->TypeGet());
7368
7369 // iv) treeNode is not used from memory
7370 assert(!treeNode->isUsedFromMemory());
7371
7372 genConsumeOperands(treeNode);
7373
7374 instruction ins = (treeNode->TypeGet() == TYP_FLOAT) ? INS_roundss : INS_roundsd;
7375 emitAttr size = emitTypeSize(treeNode);
7376
7377 regNumber dstReg = treeNode->gtRegNum;
7378
7379 unsigned ival = 0;
7380
7381 // v) tree oper is CORINFO_INTRINSIC_Round, _Ceiling, or _Floor
7382 switch (treeNode->gtIntrinsic.gtIntrinsicId)
7383 {
7384 case CORINFO_INTRINSIC_Round:
7385 ival = 4;
7386 break;
7387
7388 case CORINFO_INTRINSIC_Ceiling:
7389 ival = 10;
7390 break;
7391
7392 case CORINFO_INTRINSIC_Floor:
7393 ival = 9;
7394 break;
7395
7396 default:
7397 ins = INS_invalid;
7398 assert(!"genSSE41RoundOp: unsupported intrinsic");
7399 unreached();
7400 }
7401
7402 if (srcNode->isContained() || srcNode->isUsedFromSpillTemp())
7403 {
7404 emitter* emit = getEmitter();
7405
7406 TempDsc* tmpDsc = nullptr;
7407 unsigned varNum = BAD_VAR_NUM;
7408 unsigned offset = (unsigned)-1;
7409
7410 if (srcNode->isUsedFromSpillTemp())
7411 {
7412 assert(srcNode->IsRegOptional());
7413
7414 tmpDsc = getSpillTempDsc(srcNode);
7415 varNum = tmpDsc->tdTempNum();
7416 offset = 0;
7417
7418 regSet.tmpRlsTemp(tmpDsc);
7419 }
7420 else if (srcNode->isIndir())
7421 {
7422 GenTreeIndir* memIndir = srcNode->AsIndir();
7423 GenTree* memBase = memIndir->gtOp1;
7424
7425 switch (memBase->OperGet())
7426 {
7427 case GT_LCL_VAR_ADDR:
7428 {
7429 varNum = memBase->AsLclVarCommon()->GetLclNum();
7430 offset = 0;
7431
7432 // Ensure that all the GenTreeIndir values are set to their defaults.
7433 assert(memBase->gtRegNum == REG_NA);
7434 assert(!memIndir->HasIndex());
7435 assert(memIndir->Scale() == 1);
7436 assert(memIndir->Offset() == 0);
7437
7438 break;
7439 }
7440
7441 case GT_CLS_VAR_ADDR:
7442 {
7443 emit->emitIns_R_C_I(ins, size, dstReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
7444 return;
7445 }
7446
7447 default:
7448 {
7449 emit->emitIns_R_A_I(ins, size, dstReg, memIndir, ival);
7450 return;
7451 }
7452 }
7453 }
7454 else
7455 {
7456 switch (srcNode->OperGet())
7457 {
7458 case GT_CNS_DBL:
7459 {
7460 GenTreeDblCon* dblConst = srcNode->AsDblCon();
7461 CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(dblConst->gtDconVal, emitTypeSize(dblConst));
7462
7463 emit->emitIns_R_C_I(ins, size, dstReg, hnd, 0, ival);
7464 return;
7465 }
7466
7467 case GT_LCL_FLD:
7468 {
7469 GenTreeLclFld* lclField = srcNode->AsLclFld();
7470
7471 varNum = lclField->GetLclNum();
7472 offset = lclField->gtLclFld.gtLclOffs;
7473 break;
7474 }
7475
7476 case GT_LCL_VAR:
7477 {
7478 assert(srcNode->IsRegOptional() ||
7479 !compiler->lvaTable[srcNode->gtLclVar.gtLclNum].lvIsRegCandidate());
7480
7481 varNum = srcNode->AsLclVar()->GetLclNum();
7482 offset = 0;
7483 break;
7484 }
7485
7486 default:
7487 unreached();
7488 break;
7489 }
7490 }
7491
7492 // Ensure we got a good varNum and offset.
7493 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
7494 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
7495 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
7496 assert(offset != (unsigned)-1);
7497
7498 emit->emitIns_R_S_I(ins, size, dstReg, varNum, offset, ival);
7499 }
7500 else
7501 {
7502 inst_RV_RV_IV(ins, size, dstReg, srcNode->gtRegNum, ival);
7503 }
7504}
7505
7506//---------------------------------------------------------------------
7507// genIntrinsic - generate code for a given intrinsic
7508//
7509// Arguments
7510// treeNode - the GT_INTRINSIC node
7511//
7512// Return value:
7513// None
7514//
7515void CodeGen::genIntrinsic(GenTree* treeNode)
7516{
    // Right now only Sqrt, Abs, Round, Ceiling, and Floor are treated as math intrinsics.
7518 switch (treeNode->gtIntrinsic.gtIntrinsicId)
7519 {
7520 case CORINFO_INTRINSIC_Sqrt:
7521 {
7522 // Both operand and its result must be of the same floating point type.
7523 GenTree* srcNode = treeNode->gtOp.gtOp1;
7524 assert(varTypeIsFloating(srcNode));
7525 assert(srcNode->TypeGet() == treeNode->TypeGet());
7526
7527 genConsumeOperands(treeNode->AsOp());
7528 getEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode, srcNode);
7529 break;
7530 }
7531
7532 case CORINFO_INTRINSIC_Abs:
7533 genSSE2BitwiseOp(treeNode);
7534 break;
7535
7536 case CORINFO_INTRINSIC_Round:
7537 case CORINFO_INTRINSIC_Ceiling:
7538 case CORINFO_INTRINSIC_Floor:
7539 genSSE41RoundOp(treeNode->AsOp());
7540 break;
7541
7542 default:
7543 assert(!"genIntrinsic: Unsupported intrinsic");
7544 unreached();
7545 }
7546
7547 genProduceReg(treeNode);
7548}
7549
7550//-------------------------------------------------------------------------- //
7551// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg.
7552//
7553// Arguments
7554// treeNode - the GT_PUTARG_STK node
7555//
7556// Return value:
7557// The number of the base variable.
7558//
7559// Note:
7560// If tail call the outgoing args are placed in the caller's incoming arg stack space.
7561// Otherwise, they go in the outgoing arg area on the current frame.
7562//
7563// On Windows the caller always creates slots (homing space) in its frame for the
7564// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0.
7565// For System V systems there is no such calling convention requirement, and the code needs to find
7566// the first stack passed argument from the caller. This is done by iterating over
7567// all the lvParam variables and finding the first with lvArgReg equals to REG_STK.
7568//
7569unsigned CodeGen::getBaseVarForPutArgStk(GenTree* treeNode)
7570{
7571 assert(treeNode->OperGet() == GT_PUTARG_STK);
7572
7573 unsigned baseVarNum;
7574
    // Do we set up the stack arg in the incoming or the outgoing arg area?
    // Fast tail calls are implemented as epilog+jmp, so their stack args are set up in the
    // caller's incoming arg area. All other calls set up stack args in the outgoing arg area.
7578 if (treeNode->AsPutArgStk()->putInIncomingArgArea())
7579 {
7580 // See the note in the function header re: finding the first stack passed argument.
7581 baseVarNum = getFirstArgWithStackSlot();
7582 assert(baseVarNum != BAD_VAR_NUM);
7583
7584#ifdef DEBUG
7585 // This must be a fast tail call.
7586 assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall());
7587
        // Since it is a fast tail call, the existence of a first incoming arg is guaranteed
        // because a fast tail call requires that the caller's incoming arg area be >= the
        // outgoing arg area required for the tail call.
7591 LclVarDsc* varDsc = &(compiler->lvaTable[baseVarNum]);
7592 assert(varDsc != nullptr);
7593
7594#ifdef UNIX_AMD64_ABI
7595 assert(!varDsc->lvIsRegArg && varDsc->lvArgReg == REG_STK);
7596#else // !UNIX_AMD64_ABI
7597 // On Windows this assert is always true. The first argument will always be in REG_ARG_0 or REG_FLTARG_0.
7598 assert(varDsc->lvIsRegArg && (varDsc->lvArgReg == REG_ARG_0 || varDsc->lvArgReg == REG_FLTARG_0));
7599#endif // !UNIX_AMD64_ABI
#endif // DEBUG
7601 }
7602 else
7603 {
7604#if FEATURE_FIXED_OUT_ARGS
7605 baseVarNum = compiler->lvaOutgoingArgSpaceVar;
7606#else // !FEATURE_FIXED_OUT_ARGS
7607 assert(!"No BaseVarForPutArgStk on x86");
7608 baseVarNum = BAD_VAR_NUM;
7609#endif // !FEATURE_FIXED_OUT_ARGS
7610 }
7611
7612 return baseVarNum;
7613}
7614
7615//---------------------------------------------------------------------
7616// genAlignStackBeforeCall: Align the stack if necessary before a call.
7617//
7618// Arguments:
7619// putArgStk - the putArgStk node.
7620//
7621void CodeGen::genAlignStackBeforeCall(GenTreePutArgStk* putArgStk)
7622{
7623#if defined(UNIX_X86_ABI)
7624
7625 genAlignStackBeforeCall(putArgStk->gtCall);
7626
7627#endif // UNIX_X86_ABI
7628}
7629
7630//---------------------------------------------------------------------
7631// genAlignStackBeforeCall: Align the stack if necessary before a call.
7632//
7633// Arguments:
7634// call - the call node.
7635//
7636void CodeGen::genAlignStackBeforeCall(GenTreeCall* call)
7637{
7638#if defined(UNIX_X86_ABI)
7639
7640 // Have we aligned the stack yet?
7641 if (!call->fgArgInfo->IsStkAlignmentDone())
7642 {
7643 // We haven't done any stack alignment yet for this call. We might need to create
7644 // an alignment adjustment, even if this function itself doesn't have any stack args.
7645 // This can happen if this function call is part of a nested call sequence, and the outer
7646 // call has already pushed some arguments.
7647
7648 unsigned stkLevel = genStackLevel + call->fgArgInfo->GetStkSizeBytes();
7649 call->fgArgInfo->ComputeStackAlignment(stkLevel);
7650
7651 unsigned padStkAlign = call->fgArgInfo->GetStkAlign();
7652 if (padStkAlign != 0)
7653 {
7654 // Now generate the alignment
7655 inst_RV_IV(INS_sub, REG_SPBASE, padStkAlign, EA_PTRSIZE);
7656 AddStackLevel(padStkAlign);
7657 AddNestedAlignment(padStkAlign);
7658 }
7659
7660 call->fgArgInfo->SetStkAlignmentDone();
7661 }
7662
7663#endif // UNIX_X86_ABI
7664}
7665
7666//---------------------------------------------------------------------
7667// genRemoveAlignmentAfterCall: After a call, remove the alignment
7668// added before the call, if any.
7669//
7670// Arguments:
7671// call - the call node.
7672// bias - additional stack adjustment
7673//
7674// Note:
7675// When bias > 0, caller should adjust stack level appropriately as
7676// bias is not considered when adjusting stack level.
7677//
7678void CodeGen::genRemoveAlignmentAfterCall(GenTreeCall* call, unsigned bias)
7679{
7680#if defined(_TARGET_X86_)
7681#if defined(UNIX_X86_ABI)
7682 // Put back the stack pointer if there was any padding for stack alignment
7683 unsigned padStkAlign = call->fgArgInfo->GetStkAlign();
7684 unsigned padStkAdjust = padStkAlign + bias;
7685
7686 if (padStkAdjust != 0)
7687 {
7688 inst_RV_IV(INS_add, REG_SPBASE, padStkAdjust, EA_PTRSIZE);
7689 SubtractStackLevel(padStkAlign);
7690 SubtractNestedAlignment(padStkAlign);
7691 }
7692#else // UNIX_X86_ABI
7693 if (bias != 0)
7694 {
7695 genAdjustSP(bias);
7696 }
#endif // !UNIX_X86_ABI
7698#else // _TARGET_X86_
7699 assert(bias == 0);
#endif // !_TARGET_X86_
7701}
7702
7703#ifdef _TARGET_X86_
7704
7705//---------------------------------------------------------------------
7706// genAdjustStackForPutArgStk:
7707// adjust the stack pointer for a putArgStk node if necessary.
7708//
7709// Arguments:
7710// putArgStk - the putArgStk node.
7711//
7712// Returns: true if the stack pointer was adjusted; false otherwise.
7713//
7714// Notes:
7715// Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
7716// false if the stack arg needs to be stored at the current stack
7717// pointer address. This is exactly the opposite of the return value
7718// of this function.
7719//
7720bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
7721{
7722 const unsigned argSize = putArgStk->getArgSize();
7723 GenTree* source = putArgStk->gtGetOp1();
7724
7725#ifdef FEATURE_SIMD
7726 if (!source->OperIs(GT_FIELD_LIST) && varTypeIsSIMD(source))
7727 {
7728 inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE);
7729 AddStackLevel(argSize);
7730 m_pushStkArg = false;
7731 return true;
7732 }
7733#endif // FEATURE_SIMD
7734
7735 // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack.
7736 // This is set in Lowering, and is true if and only if:
7737 // - This argument contains any GC pointers OR
7738 // - It is a GT_FIELD_LIST OR
7739 // - It is less than 16 bytes in size.
7740 CLANG_FORMAT_COMMENT_ANCHOR;
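
    // For example (illustrative): a 24-byte struct with no GC refs is stored after a single
    // "sub esp, 24" adjustment, while a 12-byte struct, or any struct containing GC refs, is
    // pushed slot by slot instead.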
7741
7742#ifdef DEBUG
7743 switch (putArgStk->gtPutArgStkKind)
7744 {
7745 case GenTreePutArgStk::Kind::RepInstr:
7746 case GenTreePutArgStk::Kind::Unroll:
7747 assert((putArgStk->gtNumberReferenceSlots == 0) && (source->OperGet() != GT_FIELD_LIST) && (argSize >= 16));
7748 break;
7749 case GenTreePutArgStk::Kind::Push:
7750 case GenTreePutArgStk::Kind::PushAllSlots:
7751 assert((putArgStk->gtNumberReferenceSlots != 0) || (source->OperGet() == GT_FIELD_LIST) || (argSize < 16));
7752 break;
7753 case GenTreePutArgStk::Kind::Invalid:
7754 default:
7755 assert(!"Uninitialized GenTreePutArgStk::Kind");
7756 break;
7757 }
7758#endif // DEBUG
7759
7760 if (putArgStk->isPushKind())
7761 {
7762 m_pushStkArg = true;
7763 return false;
7764 }
7765 else
7766 {
7767 m_pushStkArg = false;
7768 inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE);
7769 AddStackLevel(argSize);
7770 return true;
7771 }
7772}
7773
7774//---------------------------------------------------------------------
7775// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
7776//
7777// Arguments
7778// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
7779//
7780// Return value:
7781// None
7782//
7783void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
7784{
7785 GenTreeFieldList* const fieldList = putArgStk->gtOp1->AsFieldList();
7786 assert(fieldList != nullptr);
7787
7788 // Set m_pushStkArg and pre-adjust the stack if necessary.
7789 const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);
7790
7791 // For now, we only support the "push" case; we will push a full slot for the first field of each slot
7792 // within the struct.
7793 assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);
7794
7795 // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
7796 // (Note that this mode is not currently being used.)
7797 // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
7798 // in reverse order, so we start with the current field offset at the size of the struct arg (which must be
7799 // a multiple of the target pointer size).
7800 unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
7801 unsigned prevFieldOffset = currentOffset;
7802 regNumber intTmpReg = REG_NA;
7803 regNumber simdTmpReg = REG_NA;
7804 if (putArgStk->AvailableTempRegCount() != 0)
7805 {
7806 regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
7807 if ((rsvdRegs & RBM_ALLINT) != 0)
7808 {
7809 intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT);
7810 assert(genIsValidIntReg(intTmpReg));
7811 }
7812 if ((rsvdRegs & RBM_ALLFLOAT) != 0)
7813 {
7814 simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT);
7815 assert(genIsValidFloatReg(simdTmpReg));
7816 }
7817 assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
7818 }
7819
7820 for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
7821 {
7822 GenTree* const fieldNode = current->Current();
7823 const unsigned fieldOffset = current->gtFieldOffset;
7824 var_types fieldType = current->gtFieldType;
7825
7826 // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the
7827 // field list in descending order by offset.
7828 assert(!varTypeIsLong(fieldType));
7829 assert(fieldOffset <= prevFieldOffset);
7830
7831 // Consume the register, if any, for this field. Note that genConsumeRegs() will appropriately
7832 // update the liveness info for a lclVar that has been marked RegOptional, which hasn't been
7833 // assigned a register, and which is therefore contained.
7834 // Unlike genConsumeReg(), it handles the case where no registers are being consumed.
7835 genConsumeRegs(fieldNode);
7836 regNumber argReg = fieldNode->isUsedFromSpillTemp() ? REG_NA : fieldNode->gtRegNum;
7837
7838 // If the field is slot-like, we can use a push instruction to store the entire register no matter the type.
7839 //
7840 // The GC encoder requires that the stack remain 4-byte aligned at all times. Round the adjustment up
7841 // to the next multiple of 4. If we are going to generate a `push` instruction, the adjustment must
7842 // not require rounding.
7843 // NOTE: if the field is of GC type, we must use a push instruction, since the emitter is not otherwise
7844 // able to detect stores into the outgoing argument area of the stack on x86.
7845 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
7846 int adjustment = roundUp(currentOffset - fieldOffset, 4);
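        // Worked example (illustrative): for a 12-byte struct with int fields at offsets 8 and 0,
        // the field at offset 8 is slot-like (adjustment == push size, so it is simply pushed),
        // while the field at offset 0 then sees an 8-byte adjustment, producing one "push 0" of
        // padding followed by the push of the field itself.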
7847 if (fieldIsSlot && !varTypeIsSIMD(fieldType))
7848 {
7849 fieldType = genActualType(fieldType);
7850 unsigned pushSize = genTypeSize(fieldType);
7851 assert((pushSize % 4) == 0);
7852 adjustment -= pushSize;
7853 while (adjustment != 0)
7854 {
7855 inst_IV(INS_push, 0);
7856 currentOffset -= pushSize;
7857 AddStackLevel(pushSize);
7858 adjustment -= pushSize;
7859 }
7860 m_pushStkArg = true;
7861 }
7862 else
7863 {
7864 m_pushStkArg = false;
7865
7866 // We always "push" floating point fields (i.e. they are full slot values that don't
7867 // require special handling).
7868 assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));
7869
7870 // If we can't push this field, it needs to be in a register so that we can store
7871 // it to the stack location.
7872 if (adjustment != 0)
7873 {
7874 // This moves the stack pointer to fieldOffset.
7875 // For this case, we must adjust the stack and generate stack-relative stores rather than pushes.
7876 // Adjust the stack pointer to the next slot boundary.
7877 inst_RV_IV(INS_sub, REG_SPBASE, adjustment, EA_PTRSIZE);
7878 currentOffset -= adjustment;
7879 AddStackLevel(adjustment);
7880 }
7881
7882 // Does it need to be in a byte register?
7883 // If so, we'll use intTmpReg, which must have been allocated as a byte register.
7884 // If it's already in a register, but not a byteable one, then move it.
7885 if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
7886 {
7887 assert(intTmpReg != REG_NA);
7888 noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
7889 if (argReg != REG_NA)
7890 {
7891 inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
7892 argReg = intTmpReg;
7893 }
7894 }
7895 }
7896
7897 if (argReg == REG_NA)
7898 {
7899 if (m_pushStkArg)
7900 {
7901 if (fieldNode->isUsedFromSpillTemp())
7902 {
7903 assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
7904 assert(fieldNode->IsRegOptional());
7905 TempDsc* tmp = getSpillTempDsc(fieldNode);
7906 getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
7907 regSet.tmpRlsTemp(tmp);
7908 }
7909 else
7910 {
7911 assert(varTypeIsIntegralOrI(fieldNode));
7912 switch (fieldNode->OperGet())
7913 {
7914 case GT_LCL_VAR:
7915 inst_TT(INS_push, fieldNode, 0, 0, emitActualTypeSize(fieldNode->TypeGet()));
7916 break;
7917 case GT_CNS_INT:
7918 if (fieldNode->IsIconHandle())
7919 {
7920 inst_IV_handle(INS_push, fieldNode->gtIntCon.gtIconVal);
7921 }
7922 else
7923 {
7924 inst_IV(INS_push, fieldNode->gtIntCon.gtIconVal);
7925 }
7926 break;
7927 default:
7928 unreached();
7929 }
7930 }
7931 currentOffset -= TARGET_POINTER_SIZE;
7932 AddStackLevel(TARGET_POINTER_SIZE);
7933 }
7934 else
7935 {
7936 // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
7937 assert(varTypeIsIntegralOrI(fieldNode));
7938 switch (fieldNode->OperGet())
7939 {
7940 case GT_LCL_VAR:
7941 inst_RV_TT(INS_mov, intTmpReg, fieldNode);
7942 break;
7943 case GT_CNS_INT:
7944 genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
7945 break;
7946 default:
7947 unreached();
7948 }
7949 genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
7950 }
7951 }
7952 else
7953 {
7954#if defined(FEATURE_SIMD)
7955 if (fieldType == TYP_SIMD12)
7956 {
7957 assert(genIsValidFloatReg(simdTmpReg));
7958 genStoreSIMD12ToStack(argReg, simdTmpReg);
7959 }
7960 else
7961#endif // defined(FEATURE_SIMD)
7962 {
7963 genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
7964 }
7965 if (m_pushStkArg)
7966 {
7967 // We always push a slot-rounded size
7968 currentOffset -= genTypeSize(fieldType);
7969 }
7970 }
7971
7972 prevFieldOffset = fieldOffset;
7973 }
7974 if (currentOffset != 0)
7975 {
7976 // We don't expect padding at the beginning of a struct, but it could happen with explicit layout.
7977 inst_RV_IV(INS_sub, REG_SPBASE, currentOffset, EA_PTRSIZE);
7978 AddStackLevel(currentOffset);
7979 }
7980}
7981#endif // _TARGET_X86_
7982
7983//---------------------------------------------------------------------
7984// genPutArgStk - generate code for passing an arg on the stack.
7985//
7986// Arguments
7987// treeNode - the GT_PUTARG_STK node
7988// targetType - the type of the treeNode
7989//
7990// Return value:
7991// None
7992//
7993void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
7994{
7995 GenTree* data = putArgStk->gtOp1;
7996 var_types targetType = genActualType(data->TypeGet());
7997
7998#ifdef _TARGET_X86_
7999
8000 genAlignStackBeforeCall(putArgStk);
8001
8002 if ((data->OperGet() != GT_FIELD_LIST) && varTypeIsStruct(targetType))
8003 {
8004 (void)genAdjustStackForPutArgStk(putArgStk);
8005 genPutStructArgStk(putArgStk);
8006 return;
8007 }
8008
8009 // On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
8010 assert(targetType != TYP_LONG);
8011
8012 const unsigned argSize = putArgStk->getArgSize();
8013 assert((argSize % TARGET_POINTER_SIZE) == 0);
8014
8015 if (data->isContainedIntOrIImmed())
8016 {
8017 if (data->IsIconHandle())
8018 {
8019 inst_IV_handle(INS_push, data->gtIntCon.gtIconVal);
8020 }
8021 else
8022 {
8023 inst_IV(INS_push, data->gtIntCon.gtIconVal);
8024 }
8025 AddStackLevel(argSize);
8026 }
8027 else if (data->OperGet() == GT_FIELD_LIST)
8028 {
8029 genPutArgStkFieldList(putArgStk);
8030 }
8031 else
8032 {
8033 // We should not see any contained nodes that are not immediates.
8034 assert(data->isUsedFromReg());
8035 genConsumeReg(data);
8036 genPushReg(targetType, data->gtRegNum);
8037 }
8038#else // !_TARGET_X86_
8039 {
8040 unsigned baseVarNum = getBaseVarForPutArgStk(putArgStk);
8041
8042#ifdef UNIX_AMD64_ABI
8043
8044 if (data->OperIs(GT_FIELD_LIST))
8045 {
8046 genPutArgStkFieldList(putArgStk, baseVarNum);
8047 return;
8048 }
8049 else if (varTypeIsStruct(targetType))
8050 {
8051 m_stkArgVarNum = baseVarNum;
8052 m_stkArgOffset = putArgStk->getArgOffset();
8053 genPutStructArgStk(putArgStk);
8054 m_stkArgVarNum = BAD_VAR_NUM;
8055 return;
8056 }
8057#endif // UNIX_AMD64_ABI
8058
8059 noway_assert(targetType != TYP_STRUCT);
8060
8061 // Get argument offset on stack.
        // Here we cross-check that the argument offset hasn't changed from lowering to codegen,
        // since the arg slot number was stored in the GT_PUTARG_STK node during lowering.
8064 int argOffset = putArgStk->getArgOffset();
8065
8066#ifdef DEBUG
8067 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(putArgStk->gtCall, putArgStk);
8068 assert(curArgTabEntry);
8069 assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE);
8070#endif
8071
8072 if (data->isContainedIntOrIImmed())
8073 {
8074 getEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), baseVarNum, argOffset,
8075 (int)data->AsIntConCommon()->IconValue());
8076 }
8077 else
8078 {
8079 assert(data->isUsedFromReg());
8080 genConsumeReg(data);
8081 getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, baseVarNum,
8082 argOffset);
8083 }
8084 }
8085#endif // !_TARGET_X86_
8086}
8087
8088//---------------------------------------------------------------------
8089// genPutArgReg - generate code for a GT_PUTARG_REG node
8090//
8091// Arguments
8092// tree - the GT_PUTARG_REG node
8093//
8094// Return value:
8095// None
8096//
8097void CodeGen::genPutArgReg(GenTreeOp* tree)
8098{
8099 assert(tree->OperIs(GT_PUTARG_REG));
8100
8101 var_types targetType = tree->TypeGet();
8102 regNumber targetReg = tree->gtRegNum;
8103
8104#ifndef UNIX_AMD64_ABI
8105 assert(targetType != TYP_STRUCT);
8106#endif // !UNIX_AMD64_ABI
8107
8108 GenTree* op1 = tree->gtOp1;
8109 genConsumeReg(op1);
8110
8111 // If child node is not already in the register we need, move it
8112 if (targetReg != op1->gtRegNum)
8113 {
8114 inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
8115 }
8116
8117 genProduceReg(tree);
8118}
8119
8120#ifdef _TARGET_X86_
8121// genPushReg: Push a register value onto the stack and adjust the stack level
8122//
8123// Arguments:
8124// type - the type of value to be stored
8125// reg - the register containing the value
8126//
8127// Notes:
8128// For TYP_LONG, the srcReg must be a floating point register.
8129// Otherwise, the register type must be consistent with the given type.
8130//
8131void CodeGen::genPushReg(var_types type, regNumber srcReg)
8132{
8133 unsigned size = genTypeSize(type);
8134 if (varTypeIsIntegralOrI(type) && type != TYP_LONG)
8135 {
8136 assert(genIsValidIntReg(srcReg));
8137 inst_RV(INS_push, srcReg, type);
8138 }
8139 else
8140 {
8141 instruction ins;
8142 emitAttr attr = emitTypeSize(type);
8143 if (type == TYP_LONG)
8144 {
8145 // On x86, the only way we can push a TYP_LONG from a register is if it is in an xmm reg.
8146 // This is only used when we are pushing a struct from memory to memory, and basically is
8147 // handling an 8-byte "chunk", as opposed to strictly a long type.
8148 ins = INS_movq;
8149 }
8150 else
8151 {
8152 ins = ins_Store(type);
8153 }
8154 assert(genIsValidFloatReg(srcReg));
8155 inst_RV_IV(INS_sub, REG_SPBASE, size, EA_PTRSIZE);
8156 getEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, 0);
8157 }
8158 AddStackLevel(size);
8159}
8160#endif // _TARGET_X86_
8161
8162#if defined(FEATURE_PUT_STRUCT_ARG_STK)
8163// genStoreRegToStackArg: Store a register value into the stack argument area
8164//
8165// Arguments:
8166// type - the type of value to be stored
8167// reg - the register containing the value
8168// offset - the offset from the base (see Assumptions below)
8169//
8170// Notes:
8171// A type of TYP_STRUCT instructs this method to store a 16-byte chunk
8172// at the given offset (i.e. not the full struct).
8173//
8174// Assumptions:
8175// The caller must set the context appropriately before calling this method:
8176// - On x64, m_stkArgVarNum must be set according to whether this is a regular or tail call.
8177// - On x86, the caller must set m_pushStkArg if this method should push the argument.
8178// Otherwise, the argument is stored at the given offset from sp.
8179//
8180// TODO: In the below code the load and store instructions are for 16 bytes, but the
8181// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
8182// this probably needs to be changed.
8183//
8184void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset)
8185{
8186 assert(srcReg != REG_NA);
8187 instruction ins;
8188 emitAttr attr;
8189 unsigned size;
8190
8191 if (type == TYP_STRUCT)
8192 {
8193 ins = INS_movdqu;
8194 // This should be changed!
8195 attr = EA_8BYTE;
8196 size = 16;
8197 }
8198 else
8199 {
8200#ifdef FEATURE_SIMD
8201 if (varTypeIsSIMD(type))
8202 {
8203 assert(genIsValidFloatReg(srcReg));
8204 ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly
8205 }
8206 else
8207#endif // FEATURE_SIMD
8208#ifdef _TARGET_X86_
8209 if (type == TYP_LONG)
8210 {
8211 assert(genIsValidFloatReg(srcReg));
8212 ins = INS_movq;
8213 }
8214 else
8215#endif // _TARGET_X86_
8216 {
8217 assert((varTypeIsFloating(type) && genIsValidFloatReg(srcReg)) ||
8218 (varTypeIsIntegralOrI(type) && genIsValidIntReg(srcReg)));
8219 ins = ins_Store(type);
8220 }
8221 attr = emitTypeSize(type);
8222 size = genTypeSize(type);
8223 }
8224
8225#ifdef _TARGET_X86_
8226 if (m_pushStkArg)
8227 {
8228 genPushReg(type, srcReg);
8229 }
8230 else
8231 {
8232 getEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset);
8233 }
8234#else // !_TARGET_X86_
8235 assert(m_stkArgVarNum != BAD_VAR_NUM);
8236 getEmitter()->emitIns_S_R(ins, attr, srcReg, m_stkArgVarNum, m_stkArgOffset + offset);
8237#endif // !_TARGET_X86_
8238}
8239
8240//---------------------------------------------------------------------
8241// genPutStructArgStk - generate code for copying a struct arg on the stack by value.
8242// In case there are references to heap object in the struct,
8243// it generates the gcinfo as well.
8244//
8245// Arguments
8246// putArgStk - the GT_PUTARG_STK node
8247//
8248// Notes:
8249// In the case of fixed out args, the caller must have set m_stkArgVarNum to the variable number
8250// corresponding to the argument area (where we will put the argument on the stack).
8251// For tail calls this is the baseVarNum = 0.
8252// For non tail calls this is the outgoingArgSpace.
8253void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
8254{
8255 GenTree* source = putArgStk->gtGetOp1();
8256 var_types targetType = source->TypeGet();
8257
8258#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
8259 if (putArgStk->isSIMD12())
8260 {
8261 genPutArgStkSIMD12(putArgStk);
8262 return;
8263 }
8264#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
8265
8266 if (varTypeIsSIMD(targetType))
8267 {
8268 regNumber srcReg = genConsumeReg(source);
8269 assert((srcReg != REG_NA) && (genIsValidFloatReg(srcReg)));
8270 genStoreRegToStackArg(targetType, srcReg, 0);
8271 return;
8272 }
8273
8274 assert(targetType == TYP_STRUCT);
8275
8276 if (putArgStk->gtNumberReferenceSlots == 0)
8277 {
8278 switch (putArgStk->gtPutArgStkKind)
8279 {
8280 case GenTreePutArgStk::Kind::RepInstr:
8281 genStructPutArgRepMovs(putArgStk);
8282 break;
8283 case GenTreePutArgStk::Kind::Unroll:
8284 genStructPutArgUnroll(putArgStk);
8285 break;
8286 case GenTreePutArgStk::Kind::Push:
8287 genStructPutArgUnroll(putArgStk);
8288 break;
8289 default:
8290 unreached();
8291 }
8292 }
8293 else
8294 {
        // No need to disable GC the way COPYOBJ does. Here the refs are always copied with
        // atomic operations.
8296 CLANG_FORMAT_COMMENT_ANCHOR;
8297
8298#ifdef _TARGET_X86_
        // On x86, any struct that contains GC references must be stored to the stack using `push` instructions,
        // so that the emitter properly detects the need to update the method's GC information.
8301 //
8302 // Strictly speaking, it is only necessary to use `push` to store the GC references themselves, so for structs
8303 // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
8304 // future.
8305 assert(m_pushStkArg);
8306
8307 GenTree* srcAddr = source->gtGetOp1();
8308 BYTE* gcPtrs = putArgStk->gtGcPtrs;
8309 const unsigned numSlots = putArgStk->gtNumSlots;
8310
8311 regNumber srcRegNum = srcAddr->gtRegNum;
8312 const bool srcAddrInReg = srcRegNum != REG_NA;
8313
8314 unsigned srcLclNum = 0;
8315 unsigned srcLclOffset = 0;
8316 if (srcAddrInReg)
8317 {
8318 genConsumeReg(srcAddr);
8319 }
8320 else
8321 {
8322 assert(srcAddr->OperIsLocalAddr());
8323
8324 srcLclNum = srcAddr->AsLclVarCommon()->gtLclNum;
8325 if (srcAddr->OperGet() == GT_LCL_FLD_ADDR)
8326 {
8327 srcLclOffset = srcAddr->AsLclFld()->gtLclOffs;
8328 }
8329 }
8330
8331 for (int i = numSlots - 1; i >= 0; --i)
8332 {
8333 emitAttr slotAttr;
8334 if (gcPtrs[i] == TYPE_GC_NONE)
8335 {
8336 slotAttr = EA_4BYTE;
8337 }
8338 else if (gcPtrs[i] == TYPE_GC_REF)
8339 {
8340 slotAttr = EA_GCREF;
8341 }
8342 else
8343 {
8344 assert(gcPtrs[i] == TYPE_GC_BYREF);
8345 slotAttr = EA_BYREF;
8346 }
8347
8348 const unsigned offset = i * TARGET_POINTER_SIZE;
8349 if (srcAddrInReg)
8350 {
8351 getEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, offset);
8352 }
8353 else
8354 {
8355 getEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + offset);
8356 }
8357 AddStackLevel(TARGET_POINTER_SIZE);
8358 }
8359#else // !defined(_TARGET_X86_)
8360
8361 // Consume these registers.
8362 // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
8363 genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA);
8364
8365 const bool srcIsLocal = putArgStk->gtOp1->AsObj()->gtOp1->OperIsLocalAddr();
8366 const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
8367
8368#if DEBUG
8369 unsigned numGCSlotsCopied = 0;
8370#endif // DEBUG
8371
8372 BYTE* gcPtrs = putArgStk->gtGcPtrs;
8373 const unsigned numSlots = putArgStk->gtNumSlots;
8374 for (unsigned i = 0; i < numSlots;)
8375 {
8376 if (gcPtrs[i] == TYPE_GC_NONE)
8377 {
8378 // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
8379 // instead of a sequence of movsp instructions to save cycles and code size.
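                //
                // For example (illustrative, and assuming CPOBJ_NONGC_SLOTS_LIMIT is 4): a run of
                // 6 non-GC slots emits "mov ecx, 6; rep movsq" on x64, while a run of 2 emits two
                // individual movsq instructions.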
8380 unsigned adjacentNonGCSlotCount = 0;
8381 do
8382 {
8383 adjacentNonGCSlotCount++;
8384 i++;
8385 } while ((i < numSlots) && (gcPtrs[i] == TYPE_GC_NONE));
8386
8387 // If we have a very small contiguous non-ref region, it's better just to
8388 // emit a sequence of movsp instructions
8389 if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
8390 {
8391 for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
8392 {
8393 instGen(INS_movsp);
8394 }
8395 }
8396 else
8397 {
8398 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
8399 instGen(INS_r_movsp);
8400 }
8401 }
8402 else
8403 {
8404 assert((gcPtrs[i] == TYPE_GC_REF) || (gcPtrs[i] == TYPE_GC_BYREF));
8405
8406 // We have a GC (byref or ref) pointer
                // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use the movsp
                // instruction, but the logic for emitting a GC info record is not available (it is internal
                // to the emitter). See the emitGCVarLiveUpd function. If we could call it separately, we
                // could emit instGen(INS_movsp) followed by the GC info update.
8411
8412 var_types memType = (gcPtrs[i] == TYPE_GC_REF) ? TYP_REF : TYP_BYREF;
8413 getEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
8414 genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
8415#ifdef DEBUG
8416 numGCSlotsCopied++;
8417#endif // DEBUG
8418
8419 i++;
8420 if (i < numSlots)
8421 {
8422 // Source for the copy operation.
8423 // If a LocalAddr, use EA_PTRSIZE - copy from stack.
8424 // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
8425 getEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
8426
8427 // Always copying to the stack - outgoing arg area
8428 // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
8429 getEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
8430 }
8431 }
8432 }
8433
8434 assert(numGCSlotsCopied == putArgStk->gtNumberReferenceSlots);
8435#endif // _TARGET_X86_
8436 }
8437}
8438#endif // defined(FEATURE_PUT_STRUCT_ARG_STK)
8439
8440/*****************************************************************************
8441 *
8442 * Create and record GC Info for the function.
8443 */
8444#ifndef JIT32_GCENCODER
8445void
8446#else // !JIT32_GCENCODER
8447void*
8448#endif // !JIT32_GCENCODER
8449CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr))
8450{
8451#ifdef JIT32_GCENCODER
8452 return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr));
8453#else // !JIT32_GCENCODER
8454 genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr));
8455#endif // !JIT32_GCENCODER
8456}
8457
8458#ifdef JIT32_GCENCODER
8459void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize,
8460 unsigned prologSize,
8461 unsigned epilogSize DEBUGARG(void* codePtr))
8462{
8463 BYTE headerBuf[64];
8464 InfoHdr header;
8465
8466 int s_cached;
8467
8468#ifdef WIN64EXCEPTIONS
8469 // We should do this before gcInfoBlockHdrSave since varPtrTableSize must be finalized before it
8470 if (compiler->ehAnyFunclets())
8471 {
8472 gcInfo.gcMarkFilterVarsPinned();
8473 }
8474#endif
8475
8476#ifdef DEBUG
8477 size_t headerSize =
8478#endif
8479 compiler->compInfoBlkSize =
8480 gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached);
8481
8482 size_t argTabOffset = 0;
8483 size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset);
8484
8485#if DISPLAY_SIZES
8486
8487 if (genInterruptible)
8488 {
8489 gcHeaderISize += compiler->compInfoBlkSize;
8490 gcPtrMapISize += ptrMapSize;
8491 }
8492 else
8493 {
8494 gcHeaderNSize += compiler->compInfoBlkSize;
8495 gcPtrMapNSize += ptrMapSize;
8496 }
8497
8498#endif // DISPLAY_SIZES
8499
8500 compiler->compInfoBlkSize += ptrMapSize;
8501
8502 /* Allocate the info block for the method */
8503
8504 compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize);
8505
8506#if 0 // VERBOSE_SIZES
8507 // TODO-X86-Cleanup: 'dataSize', below, is not defined
8508
8509// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100)
8510 {
8511 printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n",
8512 compiler->info.compILCodeSize,
8513 compiler->compInfoBlkSize,
8514 codeSize + dataSize,
8515 codeSize + dataSize - prologSize - epilogSize,
8516 100 * (codeSize + dataSize) / compiler->info.compILCodeSize,
8517 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize,
8518 compiler->info.compClassName,
8519 compiler->info.compMethodName);
8520    }
8521
8522#endif
8523
8524 /* Fill in the info block and return it to the caller */
8525
8526 void* infoPtr = compiler->compInfoBlkAddr;
8527
8528 /* Create the method info block: header followed by GC tracking tables */
8529
8530 compiler->compInfoBlkAddr +=
8531 gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached);
8532
8533 assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize);
8534 compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset);
8535 assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize);
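    // At this point the info block has the following layout (sizes are whatever the encoders
    // produced; this is just a sketch of the invariants asserted above and below):
    //    infoPtr + 0                       : encoded InfoHdr        (headerSize bytes)
    //    infoPtr + headerSize              : GC pointer tables      (ptrMapSize bytes)
    //    infoPtr + headerSize + ptrMapSize : end == infoPtr + compInfoBlkSize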
8536
8537#ifdef DEBUG
8538
8539 if (0)
8540 {
8541 BYTE* temp = (BYTE*)infoPtr;
8542 unsigned size = compiler->compInfoBlkAddr - temp;
8543 BYTE* ptab = temp + headerSize;
8544
8545 noway_assert(size == headerSize + ptrMapSize);
8546
8547        printf("Method info block - header [%u bytes]:", (unsigned)headerSize);
8548
8549 for (unsigned i = 0; i < size; i++)
8550 {
8551 if (temp == ptab)
8552 {
8553                printf("\nMethod info block - ptrtab [%u bytes]:", (unsigned)ptrMapSize);
8554 printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' ');
8555 }
8556 else
8557 {
8558 if (!(i % 16))
8559 printf("\n %04X: ", i);
8560 }
8561
8562 printf("%02X ", *temp++);
8563 }
8564
8565 printf("\n");
8566 }
8567
8568#endif // DEBUG
8569
8570#if DUMP_GC_TABLES
8571
8572 if (compiler->opts.dspGCtbls)
8573 {
8574 const BYTE* base = (BYTE*)infoPtr;
8575 unsigned size;
8576 unsigned methodSize;
8577 InfoHdr dumpHeader;
8578
8579 printf("GC Info for method %s\n", compiler->info.compFullName);
8580 printf("GC info size = %3u\n", compiler->compInfoBlkSize);
8581
8582 size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize);
8583 // printf("size of header encoding is %3u\n", size);
8584 printf("\n");
8585
8586 if (compiler->opts.dspGCtbls)
8587 {
8588 base += size;
8589 size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize);
8590 // printf("size of pointer table is %3u\n", size);
8591 printf("\n");
8592 noway_assert(compiler->compInfoBlkAddr == (base + size));
8593 }
8594 }
8595
8596#ifdef DEBUG
8597 if (jitOpts.testMask & 128)
8598 {
8599 for (unsigned offs = 0; offs < codeSize; offs++)
8600 {
8601 gcInfo.gcFindPtrsInFrame(infoPtr, codePtr, offs);
8602 }
8603 }
8604#endif // DEBUG
8605#endif // DUMP_GC_TABLES
8606
8607 /* Make sure we ended up generating the expected number of bytes */
8608
8609 noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize);
8610
8611 return infoPtr;
8612}
8613
8614#else // !JIT32_GCENCODER
8615void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr))
8616{
8617 IAllocator* allowZeroAlloc = new (compiler, CMK_GC) CompIAllocator(compiler->getAllocatorGC());
8618 GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC)
8619 GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM);
8620 assert(gcInfoEncoder);
8621
8622 // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32).
8623 gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize);
8624
8625 // We keep the call count for the second call to gcMakeRegPtrTable() below.
8626 unsigned callCnt = 0;
8627 // First we figure out the encoder ID's for the stack slots and registers.
8628 gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS, &callCnt);
8629 // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them).
8630 gcInfoEncoder->FinalizeSlotIds();
8631 // Now we can actually use those slot ID's to declare live ranges.
8632 gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK, &callCnt);
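    // In effect the encoder is driven with an assign-then-report protocol (sketch):
    //    pass 1 (MAKE_REG_PTR_MODE_ASSIGN_SLOTS) : request a slot id for every tracked register / stack slot
    //    FinalizeSlotIds()                       : freeze and compact the slot id set
    //    pass 2 (MAKE_REG_PTR_MODE_DO_WORK)      : report live ranges against those frozen ids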
8633
8634 if (compiler->opts.compDbgEnC)
8635 {
8636 // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp)
8637 // which is:
8638 // -return address
8639 // -saved off RBP
8640 // -saved 'this' pointer and bool for synchronized methods
8641
8642 // 4 slots for RBP + return address + RSI + RDI
8643 int preservedAreaSize = 4 * REGSIZE_BYTES;
8644
8645 if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
8646 {
8647 if (!(compiler->info.compFlags & CORINFO_FLG_STATIC))
8648 {
8649 preservedAreaSize += REGSIZE_BYTES;
8650 }
8651
8652 // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack)
8653 preservedAreaSize += 4;
8654 }
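        // Illustrative layout of the preserved frame header computed above (the exact stack ordering
        // is owned by the VM; see VM\eetwain.cpp) for a synchronized instance method:
        //    return address    REGSIZE_BYTES
        //    saved RBP         REGSIZE_BYTES
        //    saved RSI         REGSIZE_BYTES
        //    saved RDI         REGSIZE_BYTES
        //    saved 'this'      REGSIZE_BYTES  (synchronized, non-static only)
        //    lock-taken bool   4 bytes        (synchronized only)
        // i.e. preservedAreaSize = 4 * REGSIZE_BYTES + REGSIZE_BYTES + 4 in this example.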
8655
8656 // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the
8657 // frame
8658 gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize);
8659 }
8660
8661 if (compiler->opts.IsReversePInvoke())
8662 {
8663 unsigned reversePInvokeFrameVarNumber = compiler->lvaReversePInvokeFrameVar;
8664 assert(reversePInvokeFrameVarNumber != BAD_VAR_NUM && reversePInvokeFrameVarNumber < compiler->lvaRefCount);
8665 LclVarDsc& reversePInvokeFrameVar = compiler->lvaTable[reversePInvokeFrameVarNumber];
8666 gcInfoEncoder->SetReversePInvokeFrameSlot(reversePInvokeFrameVar.lvStkOffs);
8667 }
8668
8669 gcInfoEncoder->Build();
8670
8671 // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t)
8672 // let's save the values anyway for debugging purposes
8673 compiler->compInfoBlkAddr = gcInfoEncoder->Emit();
8674 compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface
8675}
8676#endif // !JIT32_GCENCODER
8677
8678/*****************************************************************************
8679 * Emit a call to a helper function.
8680 *
8681 */
8682
8683void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg)
8684{
8685 void* addr = nullptr;
8686 void* pAddr = nullptr;
8687
8688 emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN;
8689 addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr);
8690 regNumber callTarget = REG_NA;
8691 regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
8692
8693 if (!addr)
8694 {
8695 assert(pAddr != nullptr);
8696
8697        // Absolute indirect call addr
8698        // Note: The order of checks is important: always check for pc-relative first and then for
8699        // zero-relative, because the former encoding is one byte smaller than the latter.
8700 if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) ||
8701 genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr))
8702 {
8703 // generate call whose target is specified by 32-bit offset relative to PC or zero.
8704 callType = emitter::EC_FUNC_TOKEN_INDIR;
8705 addr = pAddr;
8706 }
8707 else
8708 {
8709#ifdef _TARGET_AMD64_
8710 // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero,
8711 // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to
8712 // make the call.
8713 // mov reg, addr
8714 // call [reg]
8715
8716 if (callTargetReg == REG_NA)
8717 {
8718                // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET,
8719                // but this is only valid if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
8720 callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
8721 regMaskTP callTargetMask = genRegMask(callTargetReg);
8722 noway_assert((callTargetMask & killMask) == callTargetMask);
8723 }
8724 else
8725 {
8726                // The call target must not overwrite any live variable, even though it may not be in the
8727                // kill set for the call.
8728 regMaskTP callTargetMask = genRegMask(callTargetReg);
8729 noway_assert((callTargetMask & regSet.rsMaskVars) == RBM_NONE);
8730 }
8731#endif
8732
8733 callTarget = callTargetReg;
8734 CodeGen::genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL);
8735 callType = emitter::EC_INDIR_ARD;
8736 }
8737 }
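    // To summarize, the call emitted below takes one of three shapes (illustrative sketch):
    //    addr available:                        call helperAddr            (EC_FUNC_TOKEN)
    //    pAddr encodable as rel32 / zero-rel:   call [pAddr]               (EC_FUNC_TOKEN_INDIR)
    //    otherwise (AMD64 only):                mov  callTargetReg, pAddr
    //                                           call [callTargetReg]       (EC_INDIR_ARD)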
8738
8739 // clang-format off
8740 getEmitter()->emitIns_Call(callType,
8741 compiler->eeFindHelper(helper),
8742 INDEBUG_LDISASM_COMMA(nullptr) addr,
8743 argSize,
8744 retSize
8745 MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(EA_UNKNOWN),
8746 gcInfo.gcVarPtrSetCur,
8747 gcInfo.gcRegGCrefSetCur,
8748 gcInfo.gcRegByrefSetCur,
8749 BAD_IL_OFFSET, // IL offset
8750 callTarget, // ireg
8751 REG_NA, 0, 0, // xreg, xmul, disp
8752 false // isJump
8753 );
8754 // clang-format on
8755
8756 regSet.verifyRegistersUsed(killMask);
8757}
8758
8759/*****************************************************************************
8760* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog
8761* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late
8762* disassembler thinks the instructions are the same as we do.
8763*/
8764
8765// Uncomment "#define ALL_XARCH_EMITTER_UNIT_TESTS" to run all the unit tests here.
8766// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time.
8767//#define ALL_XARCH_EMITTER_UNIT_TESTS
8768
8769#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
8770void CodeGen::genAmd64EmitterUnitTests()
8771{
8772 if (!verbose)
8773 {
8774 return;
8775 }
8776
8777 if (!compiler->opts.altJit)
8778 {
8779 // No point doing this in a "real" JIT.
8780 return;
8781 }
8782
8783 // Mark the "fake" instructions in the output.
8784 printf("*************** In genAmd64EmitterUnitTests()\n");
8785
8786 // We use this:
8787 // genDefineTempLabel(genCreateTempLabel());
8788 // to create artificial labels to help separate groups of tests.
8789
8790 //
8791 // Loads
8792 //
8793 CLANG_FORMAT_COMMENT_ANCHOR;
8794
8795#ifdef ALL_XARCH_EMITTER_UNIT_TESTS
8796 genDefineTempLabel(genCreateTempLabel());
8797
8798 // vhaddpd ymm0,ymm1,ymm2
8799 getEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8800 // vaddss xmm0,xmm1,xmm2
8801 getEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8802 // vaddsd xmm0,xmm1,xmm2
8803 getEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8804 // vaddps xmm0,xmm1,xmm2
8805 getEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8806 // vaddps ymm0,ymm1,ymm2
8807 getEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8808 // vaddpd xmm0,xmm1,xmm2
8809 getEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8810 // vaddpd ymm0,ymm1,ymm2
8811 getEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8812 // vsubss xmm0,xmm1,xmm2
8813 getEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8814 // vsubsd xmm0,xmm1,xmm2
8815 getEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8816    // vsubps xmm0,xmm1,xmm2
8817 getEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8818 // vsubps ymm0,ymm1,ymm2
8819 getEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8820 // vsubpd xmm0,xmm1,xmm2
8821 getEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8822 // vsubpd ymm0,ymm1,ymm2
8823 getEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8824 // vmulss xmm0,xmm1,xmm2
8825 getEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8826 // vmulsd xmm0,xmm1,xmm2
8827 getEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8828 // vmulps xmm0,xmm1,xmm2
8829 getEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8830 // vmulpd xmm0,xmm1,xmm2
8831 getEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8832 // vmulps ymm0,ymm1,ymm2
8833 getEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8834 // vmulpd ymm0,ymm1,ymm2
8835 getEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8836 // vandps xmm0,xmm1,xmm2
8837 getEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8838 // vandpd xmm0,xmm1,xmm2
8839 getEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8840 // vandps ymm0,ymm1,ymm2
8841 getEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8842 // vandpd ymm0,ymm1,ymm2
8843 getEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8844 // vorps xmm0,xmm1,xmm2
8845 getEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8846 // vorpd xmm0,xmm1,xmm2
8847 getEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8848 // vorps ymm0,ymm1,ymm2
8849 getEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8850 // vorpd ymm0,ymm1,ymm2
8851 getEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8852 // vdivss xmm0,xmm1,xmm2
8853 getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8854 // vdivsd xmm0,xmm1,xmm2
8855 getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8856 // vdivss xmm0,xmm1,xmm2
8857 getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8858 // vdivsd xmm0,xmm1,xmm2
8859 getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8860
8861    // vcvtss2sd xmm0,xmm1,xmm2
8862 getEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8863    // vcvtsd2ss xmm0,xmm1,xmm2
8864 getEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
8865#endif // ALL_XARCH_EMITTER_UNIT_TESTS
8866 printf("*************** End of genAmd64EmitterUnitTests()\n");
8867}
8868
8869#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
8870
8871#endif // _TARGET_XARCH_
8872