// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                  Intel hardware intrinsic Code Generator                  XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifdef FEATURE_HW_INTRINSICS

#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

//------------------------------------------------------------------------
// assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
//
// Arguments:
//    lowering - The lowering phase from the compiler
//    node     - The HWIntrinsic node that has the contained node
//    op       - The op that is contained
//
static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
{
#if DEBUG
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
    // spillage and for isUsedFromMemory contained nodes, in the case where the register allocator decided not to
    // allocate a register in the first place).

    bool supportsRegOptional = false;
    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
    assert(isContainable || supportsRegOptional);
#endif // DEBUG
}

//------------------------------------------------------------------------
// genIsTableDrivenHWIntrinsic: Returns whether a hardware intrinsic can be table-driven in CodeGen
//
// Arguments:
//    intrinsicId - The NamedIntrinsic of the HW intrinsic
//    category    - The category of the HW intrinsic
//
// Return Value:
//    returns true if this intrinsic can be table-driven in CodeGen
//
static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
{
    // TODO: move more categories into the table-driven framework.
    // HW_Category_Helper intrinsics and those marked HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need
    // manual codegen.
    const bool tableDrivenCategory =
        (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
    const bool tableDrivenFlag =
        !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
    return tableDrivenCategory && tableDrivenFlag;
}

//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);

    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));

    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
    {
        GenTree*  op1        = node->gtGetOp1();
        GenTree*  op2        = node->gtGetOp2();
        regNumber targetReg  = node->gtRegNum;
        var_types targetType = node->TypeGet();
        var_types baseType   = node->gtSIMDBaseType;

        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter*  emit   = getEmitter();

        assert(numArgs >= 0);
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);

        switch (numArgs)
        {
            case 1:
            {
                genConsumeOperands(node);
                op1Reg = op1->gtRegNum;

                if (node->OperIsMemoryLoad())
                {
                    emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
                }
                else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                {
                    emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
                }
                else if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
                }
                else
                {
                    genHWIntrinsic_R_RM(node, ins, simdSize);
                }
                break;
            }

            case 2:
            {
                genConsumeOperands(node);

                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;

                if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
                {
                    // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
                    //
                    // For non-commutative intrinsics, we should have ensured that op2 was marked
                    // delay free in order to prevent it from getting assigned the same register
                    // as target. However, for commutative intrinsics, we can just swap the operands
                    // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
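                    //
                    // For example (registers purely illustrative): for a commutative RMW instruction
                    // such as "addps", with op1 in xmm1 and op2/target in xmm0, swapping lets us emit
                    // "addps xmm0, xmm1" directly rather than first copying op1 into another register.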

                    noway_assert(node->OperIsCommutative());
                    op2Reg = op1Reg;
                    op1Reg = targetReg;
                }

                if (category == HW_Category_MemoryStore)
                {
                    emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
                }
                else if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_R_RM_I(node, ins, (int8_t)ival);
                }
                else if (category == HW_Category_MemoryLoad)
                {
                    if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                    {
                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
                    }
                    else
                    {
                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
                    }
                }
                else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                {
                    assert(ival == -1);

                    if (intrinsicId == NI_SSE2_Extract)
                    {
                        // The extract instructions write to a general-purpose register, so we use
                        // the size of an int as the emit size.
                        simdSize = emitTypeSize(TYP_INT);
                    }

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };

                    if (op2->IsCnsIntOrI())
                    {
                        ssize_t ival = op2->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else
                {
                    genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
                }
                break;
            }

            case 3:
            {
                assert(op1->OperIsList());
                assert(op1->gtGetOp2()->OperIsList());
                assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());

                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                regNumber op3Reg = op3->gtRegNum;

                if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                {
                    assert(ival == -1);

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };

                    if (op3->IsCnsIntOrI())
                    {
                        ssize_t ival = op3->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else if (category == HW_Category_MemoryStore)
                {
                    if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                    {
                        emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                    }
                    else
                    {
                        assert(intrinsicId == NI_SSE2_MaskMove);
                        assert(targetReg == REG_NA);

                        // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
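                        // (Illustrative note: "maskmovdqu" stores only those bytes of op1 whose
                        // corresponding mask byte in op2 has its high bit set, to the address
                        // implicitly taken from DI/EDI/RDI.)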
                        if (op3Reg != REG_EDI)
                        {
                            emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                        }
                        emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
                    }
                }
                else
                {
                    switch (intrinsicId)
                    {
                        case NI_SSE41_BlendVariable:
                        case NI_AVX_BlendVariable:
                        case NI_AVX2_BlendVariable:
                        {
                            genHWIntrinsic_R_R_RM_R(node, ins);
                            break;
                        }

                        default:
                        {
                            unreached();
                            break;
                        }
                    }
                }
                break;
            }

            default:
                unreached();
                break;
        }
        genProduceReg(node);
        return;
    }

    switch (isa)
    {
        case InstructionSet_Base:
            genBaseIntrinsic(node);
            break;
        case InstructionSet_SSE:
        case InstructionSet_SSE_X64:
            genSSEIntrinsic(node);
            break;
        case InstructionSet_SSE2:
        case InstructionSet_SSE2_X64:
            genSSE2Intrinsic(node);
            break;
        case InstructionSet_SSE41:
        case InstructionSet_SSE41_X64:
            genSSE41Intrinsic(node);
            break;
        case InstructionSet_SSE42:
        case InstructionSet_SSE42_X64:
            genSSE42Intrinsic(node);
            break;
        case InstructionSet_AVX:
        case InstructionSet_AVX2:
            genAvxOrAvx2Intrinsic(node);
            break;
        case InstructionSet_AES:
            genAESIntrinsic(node);
            break;
        case InstructionSet_BMI1:
        case InstructionSet_BMI1_X64:
        case InstructionSet_BMI2:
        case InstructionSet_BMI2_X64:
            genBMI1OrBMI2Intrinsic(node);
            break;
        case InstructionSet_FMA:
            genFMAIntrinsic(node);
            break;
        case InstructionSet_LZCNT:
        case InstructionSet_LZCNT_X64:
            genLZCNTIntrinsic(node);
            break;
        case InstructionSet_PCLMULQDQ:
            genPCLMULQDQIntrinsic(node);
            break;
        case InstructionSet_POPCNT:
        case InstructionSet_POPCNT_X64:
            genPOPCNTIntrinsic(node);
            break;
        default:
            unreached();
            break;
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitter*  emit       = getEmitter();

    if (op2 != nullptr)
    {
        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags; the
        // instruction itself consumes both operands, with op1 taking the place of the
        // target register and op2 being the register/memory operand.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
//                        an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
                    return;
                }

                default:
                {
                    emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival);
    }
    else
    {
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    regNumber op1Reg    = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node      - The hardware intrinsic node
//    ins       - The instruction being generated
//    attr      - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - The second operand, which may be in a register or memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    if (op1->OperIsList())
    {
        assert(op2 == nullptr);

        GenTreeArgList* argList = op1->AsArgList();

        op1     = argList->Current();
        argList = argList->Rest();

        op2     = argList->Current();
        argList = argList->Rest();

        assert(argList->Current() != nullptr);
        assert(argList->Rest() == nullptr);
    }

    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
                                               ival);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, another register operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    GenTree*  op3        = nullptr;
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    assert(op1->OperIsList());
    assert(op2 == nullptr);

    GenTreeArgList* argList = op1->AsArgList();

    op1     = argList->Current();
    argList = argList->Rest();

    op2     = argList->Current();
    argList = argList->Rest();

    op3 = argList->Current();
    assert(argList->Rest() == nullptr);

    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg,
                                               memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op3->isIndir())
        {
            GenTreeIndir* memIndir = op3->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsicJumpTableFallback: generate the jump-table fallback for imm-intrinsics
//                                  with a non-constant argument
//
// Arguments:
//    intrinsic      - intrinsic ID
//    nonConstImmReg - the register containing the non-constant imm8 argument
//    baseReg        - a register for the start of the switch table
//    offsReg        - a register for the offset into the switch table
//    emitSwCase     - the lambda that generates a switch-case body
//
// Note:
//    This function can be used for all imm-intrinsics (whether full-range or not).
//    The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR
//    (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to do a range-check.
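//
//    As a rough sketch (register names illustrative), the dispatch sequence emitted below is:
//
//        lea   offsReg, [jumpTableBase]                      ; address of the jump table data
//        mov   offsReg, dword ptr [offsReg + 4 * imm8Reg]    ; load the 32-bit offset for this imm8
//        lea   baseReg, [functionBase]                       ; RIP-relative base address
//        add   offsReg, baseReg                              ; compute the case label address
//        jmp   offsReg
//
//    followed, for each possible imm8 value, by a block of "emitSwCase(i); jmp switchTableEnd".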
//
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
                                              regNumber                 nonConstImmReg,
                                              regNumber                 baseReg,
                                              regNumber                 offsReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
{
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
    // that does not work with the compiler-generated jump-table fallback below.
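    // (For instance, the gather "scale" immediate only admits the values 1, 2, 4 and 8, so a dense
    // 0..255 jump table is a poor fit for such a range; this example is illustrative.)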
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();

    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];

    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
    unsigned jmpTableOffs = 0;

    // Emit the jump table
    for (unsigned i = 0; i < maxByte; i++)
    {
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    }

    emit->emitDataGenEnd();

    // Compute and jump to the appropriate offset in the switch table
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);

    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);

    // Emit the switch table entries

    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();

    genDefineTempLabel(switchTableBeg);

    for (unsigned i = 0; i < maxByte; i++)
    {
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    }

    genDefineTempLabel(switchTableEnd);
}

//------------------------------------------------------------------------
// genBaseIntrinsic: Generates the code for a base hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
// Note:
//    We currently assume that all base intrinsics only have a single operand.
//
void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    assert(compiler->compSupports(InstructionSet_SSE));
    assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));

    GenTree*  op1    = node->gtGetOp1();
    regNumber op1Reg = REG_NA;

    if (op1 != nullptr)
    {
        assert(!op1->OperIsList());
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    assert(node->gtGetOp2() == nullptr);

    emitter*    emit = getEmitter();
    emitAttr    attr = EA_ATTR(node->gtSIMDSize);
    instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

    switch (intrinsicId)
    {
        case NI_Base_Vector128_CreateScalarUnsafe:
        case NI_Base_Vector256_CreateScalarUnsafe:
        {
            if (varTypeIsIntegral(baseType))
            {
                genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
            }
            else
            {
                assert(varTypeIsFloating(baseType));

                attr = emitTypeSize(baseType);

                if (op1->isContained() || op1->isUsedFromSpillTemp())
                {
                    genHWIntrinsic_R_RM(node, ins, attr);
                }
                else if (targetReg != op1Reg)
                {
                    // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            break;
        }

        case NI_Base_Vector128_ToScalar:
        case NI_Base_Vector256_ToScalar:
        {
            assert(varTypeIsFloating(baseType));

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_ToVector256:
        {
            // ToVector256 has zero-extend semantics in order to ensure it is deterministic.
            // We always emit a move to the target register, even when op1Reg == targetReg,
            // in order to ensure that Bits MAXVL-1:128 are zeroed.
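            //
            // A same-register move suffices for this because, under the VEX encoding, a 128-bit
            // "vmovaps xmm0, xmm0" still writes the full destination register and zeroes bits
            // MAXVL-1:128 (an illustrative note; the emitter picks the actual encoding).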

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_ToVector256Unsafe:
        case NI_Base_Vector256_GetLower:
        {
            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_Zero:
        case NI_Base_Vector256_Zero:
        {
            assert(op1 == nullptr);
            emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    GenTree*       op3         = nullptr;
    GenTree*       op4         = nullptr;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    regNumber op1Reg = REG_NA;
    regNumber op2Reg = REG_NA;
    regNumber op3Reg = REG_NA;
    regNumber op4Reg = REG_NA;
    emitter*  emit   = getEmitter();

    if ((op1 != nullptr) && !op1->OperIsList())
    {
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        case NI_SSE_CompareEqualOrderedScalar:
        case NI_SSE_CompareEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);
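
            // Illustrative note: the scalar compare ((u)comiss) sets PF = 1 for an unordered result
            // (i.e. a NaN operand), so "setpo" captures "ordered" while "sete" captures ZF; ANDing
            // the two yields 1 only for an ordered-equal comparison.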

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE_CompareGreaterThanOrderedScalar:
        case NI_SSE_CompareGreaterThanUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
        case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareLessThanOrderedScalar:
        case NI_SSE_CompareLessThanUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareLessThanOrEqualOrderedScalar:
        case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareNotEqualOrderedScalar:
        case NI_SSE_CompareNotEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE_X64_ConvertScalarToVector128Single:
        {
            assert(baseType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE_MoveMask:
        {
            assert(baseType == TYP_FLOAT);
            assert(op2 == nullptr);

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
            emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
            break;
        }

        case NI_SSE_Prefetch0:
        case NI_SSE_Prefetch1:
        case NI_SSE_Prefetch2:
        case NI_SSE_PrefetchNonTemporal:
        {
            assert(baseType == TYP_UBYTE);
            assert(op2 == nullptr);

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
            emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
            break;
        }

        case NI_SSE_StoreFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_sfence);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    emitter*       emit        = getEmitter();

    if ((op1 != nullptr) && !op1->OperIsList())
    {
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        // All integer overloads are handled by table codegen
        case NI_SSE2_CompareLessThan:
        {
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            assert(baseType == TYP_DOUBLE);

            int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
            assert((ival >= 0) && (ival <= 127));

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            op2Reg          = op2->gtRegNum;
            emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);

            break;
        }

        case NI_SSE2_CompareEqualOrderedScalar:
        case NI_SSE2_CompareEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE2_CompareGreaterThanOrderedScalar:
        case NI_SSE2_CompareGreaterThanUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
        case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareLessThanOrderedScalar:
        case NI_SSE2_CompareLessThanUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
        case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareNotEqualOrderedScalar:
        case NI_SSE2_CompareNotEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            regNumber   tmpReg = node->GetSingleTempReg();

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }
        case NI_SSE2_X64_ConvertScalarToVector128Double:
        {
            assert(baseType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE2_X64_ConvertScalarToVector128Int64:
        case NI_SSE2_X64_ConvertScalarToVector128UInt64:
        {
            assert(baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
            break;
        }

        case NI_SSE2_ConvertToInt32:
        case NI_SSE2_ConvertToInt32WithTruncation:
        case NI_SSE2_ConvertToUInt32:
        case NI_SSE2_X64_ConvertToUInt64:
        case NI_SSE2_X64_ConvertToInt64:
        {
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            if (varTypeIsIntegral(baseType))
            {
                assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
                emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            }
            else
            {
                assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
                genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
            }
            break;
        }

        case NI_SSE2_LoadFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_lfence);
            break;
        }

        case NI_SSE2_MemoryFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_mfence);
            break;
        }

        case NI_SSE2_MoveMask:
        {
            assert(op2 == nullptr);
            assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
            break;
        }

        case NI_SSE2_StoreNonTemporal:
        case NI_SSE2_X64_StoreNonTemporal:
        {
            assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            op2Reg          = op2->gtRegNum;
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    GenTree*       op3         = nullptr;
    GenTree*       op4         = nullptr;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    regNumber op1Reg = REG_NA;
    regNumber op2Reg = REG_NA;
    regNumber op3Reg = REG_NA;
    regNumber op4Reg = REG_NA;
    emitter*  emit   = getEmitter();

    if ((op1 != nullptr) && !op1->OperIsList())
    {
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        case NI_SSE41_TestAllOnes:
        {
            regNumber tmpReg = node->GetSingleTempReg();
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
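            // Illustrative note: "ptest dst, src" sets CF when (src AND NOT dst) is all zero. With
            // tmpReg set to all ones via pcmpeqd, CF is therefore 1 exactly when op1 is all ones,
            // which the setb below materializes into targetReg.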
            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestAllZeros:
        case NI_SSE41_TestZ:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestC:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestMixOnesZeros:
        case NI_SSE41_TestNotZAndNotC:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_Extract:
        case NI_SSE41_X64_Extract:
        {
            regNumber   tmpTargetReg = REG_NA;
            instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            if (baseType == TYP_FLOAT)
            {
                tmpTargetReg = node->ExtractTempReg();
            }

            auto emitSwCase = [&](int8_t i) {
                if (baseType == TYP_FLOAT)
                {
                    // The extract instructions write to a general-purpose register, so we use
                    // the size of an int as the emit size.
                    emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i);
                    emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
                }
                else
                {
                    emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i);
                }
            };

            if (op2->IsCnsIntOrI())
            {
                ssize_t ival = op2->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
            }
            else
            {
                // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
            }
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}
1902
//------------------------------------------------------------------------
// genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      baseType    = node->gtSIMDBaseType;
    var_types      targetType  = node->TypeGet();
    emitter*       emit        = getEmitter();

    regNumber op1Reg = op1->gtRegNum;
    genConsumeOperands(node);

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2 != nullptr);
    assert(!node->OperIsCommutative());

    switch (intrinsicId)
    {
        case NI_SSE42_Crc32:
        case NI_SSE42_X64_Crc32:
        {
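            // crc32 is an RMW instruction that accumulates into its first operand, so the op1
            // accumulator must be in targetReg before the crc32 that folds in op2 is emitted.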
            if (op1Reg != targetReg)
            {
                assert(op2->gtRegNum != targetReg);
                emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
            }

            // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
            // overload that explicitly takes the operands.
            node->gtOp1 = op2;
            node->gtOp2 = nullptr;

            if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
            {
                assert(targetType == TYP_INT);
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
            }
            else
            {
                assert(op1->TypeGet() == op2->TypeGet());
                assert((targetType == TYP_INT) || (targetType == TYP_LONG));
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    regNumber      targetReg   = node->gtRegNum;
    emitter*       emit        = getEmitter();

    if ((op1 != nullptr) && !op1->OperIsList())
    {
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        case NI_AVX2_ConvertToInt32:
        case NI_AVX2_ConvertToUInt32:
        {
            assert(op2 == nullptr);
            assert((baseType == TYP_INT) || (baseType == TYP_UINT));
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            break;
        }

        case NI_AVX2_GatherVector128:
        case NI_AVX2_GatherVector256:
        case NI_AVX2_GatherMaskVector128:
        case NI_AVX2_GatherMaskVector256:
        {
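            // GatherVector128/256 take (baseAddress, index, scale), so the argument list has three
            // entries; GatherMaskVector128/256 take (source, baseAddress, index, mask, scale), five.
            // Walk the list, consuming each operand's registers as we go.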
            GenTreeArgList* list = op1->AsArgList();
            op1                  = list->Current();
            op1Reg               = op1->gtRegNum;
            genConsumeRegs(op1);

            list   = list->Rest();
            op2    = list->Current();
            op2Reg = op2->gtRegNum;
            genConsumeRegs(op2);

            list         = list->Rest();
            GenTree* op3 = list->Current();
            genConsumeRegs(op3);

            list             = list->Rest();
            GenTree* op4     = nullptr;
            GenTree* lastOp  = nullptr;
            GenTree* indexOp = nullptr;

            regNumber op3Reg       = REG_NA;
            regNumber op4Reg       = REG_NA;
            regNumber addrBaseReg  = REG_NA;
            regNumber addrIndexReg = REG_NA;
            regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);

            if (numArgs == 5)
            {
                assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
                op4    = list->Current();
                list   = list->Rest();
                lastOp = list->Current();
                op3Reg = op3->gtRegNum;
                op4Reg = op4->gtRegNum;
                genConsumeRegs(op4);
                addrBaseReg  = op2Reg;
                addrIndexReg = op3Reg;
                indexOp      = op3;

                // Copy op4Reg into the temporary mask register; the gather instruction clears the
                // mask register as it executes.
                emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);

                if (targetReg != op1Reg)
                {
                    // copy the source vector to the target register for the masked merge
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            else
            {
                assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
                addrBaseReg  = op1Reg;
                addrIndexReg = op2Reg;
                indexOp      = op2;
                lastOp       = op3;

                // generate an all-ones mask vector
                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
            }

            bool isVector128GatherWithVector256Index =
                (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);

            // hwintrinsiclistxarch.h defaults to the dword-index gather instructions, so switch to
            // the qword-index forms when the index base type is long.
            if (varTypeIsLong(node->gtIndexBaseType))
            {
                switch (ins)
                {
                    case INS_vpgatherdd:
                        ins = INS_vpgatherqd;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vpgatherdq:
                        ins = INS_vpgatherqq;
                        break;
                    case INS_vgatherdps:
                        ins = INS_vgatherqps;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vgatherdpd:
                        ins = INS_vgatherqpd;
                        break;
                    default:
                        unreached();
                }
            }

            assert(lastOp->IsCnsIntOrI());
            ssize_t ival = lastOp->AsIntCon()->IconValue();
            assert((ival >= 0) && (ival <= 255));
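
            // The VSIB encoding used by the gather instructions requires the destination, mask, and
            // index registers to be pairwise distinct (the instruction #UDs otherwise); assert that
            // the register allocator honored that constraint.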
            assert(targetReg != maskReg);
            assert(targetReg != addrIndexReg);
            assert(maskReg != addrIndexReg);
            emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);

            break;
        }
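
        // Like the SSE4.1 ptest cases above: ptest (and vtestps/vtestpd, which consider only the
        // sign bits) sets CF when (~first & second) == 0 and ZF when (first & second) == 0, so
        // setb, sete, and seta materialize TestC, TestZ, and TestNotZAndNotC respectively.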
        case NI_AVX_TestC:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestNotZAndNotC:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestZ:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_ExtractVector128:
        case NI_AVX_InsertVector128:
        case NI_AVX2_ExtractVector128:
        case NI_AVX2_InsertVector128:
        {
            GenTree* lastOp = nullptr;
            if (numArgs == 2)
            {
                assert((intrinsicId == NI_AVX_ExtractVector128) || (intrinsicId == NI_AVX2_ExtractVector128));
                op2Reg = op2->gtRegNum;
                lastOp = op2;
            }
            else
            {
                assert(numArgs == 3);
                assert(op1->OperIsList());
                assert(op1->gtGetOp2()->OperIsList());
                assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());

                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList = argList->Rest();
                lastOp  = argList->Current();
                genConsumeRegs(lastOp);
            }

            regNumber op3Reg = lastOp->gtRegNum;
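
            // emitSwCase emits one Extract/InsertVector128 with immediate 'i'. For the three-operand
            // forms, ExtractVector128 stores the selected 128-bit half to the address in op1Reg,
            // while InsertVector128 reads the replacement half either from memory (op2 is a
            // TYP_I_IMPL address) or from op2's XMM register; the two-operand ExtractVector128
            // returns the half in the target register.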
            auto emitSwCase = [&](int8_t i) {
                if (numArgs == 3)
                {
                    if (intrinsicId == NI_AVX_ExtractVector128 || intrinsicId == NI_AVX2_ExtractVector128)
                    {
                        emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, i);
                    }
                    else if (op2->TypeGet() == TYP_I_IMPL)
                    {
                        emit->emitIns_SIMD_R_R_AR_I(ins, attr, targetReg, op1Reg, op2Reg, i);
                    }
                    else
                    {
                        assert(op2->TypeGet() == TYP_SIMD16);
                        emit->emitIns_SIMD_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, i);
                    }
                }
                else
                {
                    assert(numArgs == 2);
                    assert(intrinsicId == NI_AVX_ExtractVector128 || intrinsicId == NI_AVX2_ExtractVector128);
                    emit->emitIns_SIMD_R_R_I(ins, attr, targetReg, op1Reg, i);
                }
            };

            if (lastOp->IsCnsIntOrI())
            {
                ssize_t ival = lastOp->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
            }
            else
            {
                // We emit a fallback case for the scenario when the imm-op is not a constant. This
                // normally happens when the intrinsic is called indirectly, such as via Reflection.
                // However, it can also occur if the consumer calls it directly and just doesn't pass
                // a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
            }
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genAESIntrinsic: Generates the code for an AES hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
{
    NYI("Implement AES intrinsic code generation");
}

//------------------------------------------------------------------------
// genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 or BMI2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
    emitter*       emit        = getEmitter();

    assert(targetReg != REG_NA);
    assert(op1 != nullptr);

    if (!op1->OperIsList())
    {
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        case NI_BMI1_AndNot:
        case NI_BMI1_X64_AndNot:
        case NI_BMI1_BitFieldExtract:
        case NI_BMI1_X64_BitFieldExtract:
        case NI_BMI2_ParallelBitDeposit:
        case NI_BMI2_ParallelBitExtract:
        case NI_BMI2_X64_ParallelBitDeposit:
        case NI_BMI2_X64_ParallelBitExtract:
        case NI_BMI2_ZeroHighBits:
        case NI_BMI2_X64_ZeroHighBits:
        {
            assert(op2 != nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_ExtractLowestSetBit:
        case NI_BMI1_GetMaskUpToLowestSetBit:
        case NI_BMI1_ResetLowestSetBit:
        case NI_BMI1_X64_ExtractLowestSetBit:
        case NI_BMI1_X64_GetMaskUpToLowestSetBit:
        case NI_BMI1_X64_ResetLowestSetBit:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_TrailingZeroCount:
        case NI_BMI1_X64_TrailingZeroCount:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genXCNTIntrinsic(node, ins);
            break;
        }

        case NI_BMI2_MultiplyNoFlags:
        case NI_BMI2_X64_MultiplyNoFlags:
        {
            int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
            assert(numArgs == 2 || numArgs == 3);
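
            // MultiplyNoFlags has two managed shapes: (left, right), which returns the high half of
            // the product, and (left, right, low), which also stores the low half through the 'low'
            // pointer. MULX takes one factor implicitly in EDX/RDX and writes the high and low
            // halves to two destination registers; when both destinations are the same register,
            // that register receives the high half.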
            regNumber op1Reg = REG_NA;
            regNumber op2Reg = REG_NA;
            regNumber op3Reg = REG_NA;
            regNumber lowReg = REG_NA;

            if (numArgs == 2)
            {
                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;
                lowReg = targetReg;
            }
            else
            {
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg  = op1->gtRegNum;
                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg  = op2->gtRegNum;
                argList = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                op3Reg = op3->gtRegNum;
                assert(op3Reg != op1Reg);
                assert(op3Reg != targetReg);
                assert(op3Reg != REG_EDX);
                lowReg = node->GetSingleTempReg();
                assert(op3Reg != lowReg);
                assert(lowReg != targetReg);
            }

            emitAttr attr = emitTypeSize(targetType);
            // Move the first operand into the implicit source register EDX/RDX
            if (op1Reg != REG_EDX)
            {
                assert(op2Reg != REG_EDX);
                emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
            }

            // Generate the MULX: targetReg receives the high half of EDX/RDX * op2, lowReg the low half
            genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);

            // If the lower half of the result is required, store it to the memory pointed to by op3
            if (numArgs == 3)
            {
                emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    GenTree*       op1         = node->gtGetOp1();
    regNumber      targetReg   = node->gtRegNum;

    assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
    assert(op1 != nullptr);
    assert(op1->OperIsList());
    assert(op1->gtGetOp2()->OperIsList());
    assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());

    GenTreeArgList* argList = op1->AsArgList();
    op1                     = argList->Current();
    genConsumeRegs(op1);

    argList = argList->Rest();
    GenTree* op2 = argList->Current();
    genConsumeRegs(op2);

    argList = argList->Rest();
    GenTree* op3 = argList->Current();
    genConsumeRegs(op3);

    regNumber op1Reg;
    regNumber op2Reg;

    bool       isCommutative   = false;
    const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

    // Intrinsics with CopyUpperBits semantics cannot have op1 contained
    assert(!copiesUpperBits || !op1->isContained());
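
    // lookupIns returns the 213 form of the FMA instruction. The instruction table lists the 132,
    // 213, and 231 forms adjacently, so 'ins - 1' selects the 132 form and 'ins + 1' the 231 form
    // when a different operand is contained.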
    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // 213 form: op1 = (op2 * op1) + [op3]

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
    else if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // 132 form: op1 = (op1 * op3) + [op2]

        ins    = (instruction)(ins - 1);
        op1Reg = op1->gtRegNum;
        op2Reg = op3->gtRegNum;
        op3    = op2;
    }
    else if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        // 231 form: op3 = (op2 * op3) + [op1]

        ins    = (instruction)(ins + 1);
        op1Reg = op3->gtRegNum;
        op2Reg = op2->gtRegNum;
        op3    = op1;
    }
    else
    {
        // 213 form: op1 = (op2 * op1) + op3

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }

    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
    {
        assert(node->isRMWHWIntrinsic(compiler));

        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

        op2Reg = op1Reg;
        op1Reg = targetReg;
    }

    genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
{
    assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
           node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);

    genConsumeOperands(node);
    genXCNTIntrinsic(node, INS_lzcnt);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
{
    NYI("Implement PCLMULQDQ intrinsic code generation");
}

//------------------------------------------------------------------------
// genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
{
    assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);

    genConsumeOperands(node);
    genXCNTIntrinsic(node, INS_popcnt);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
// the target register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
{
    // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
    // (POPCNT only) processors, so insert an `xor target, target` to break the dependency via register renaming,
    // but only when the target register is not also a source.

    GenTree*  op1        = node->gtGetOp1();
    regNumber sourceReg1 = REG_NA;
    regNumber sourceReg2 = REG_NA;
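
    // Determine the registers the source occupies: either its own register, or the base and index
    // registers of a contained indirection, so the xor below is skipped whenever it would zero a
    // register the instruction still reads.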
    if (!op1->isContained())
    {
        sourceReg1 = op1->gtRegNum;
    }
    else if (op1->isIndir())
    {
        GenTreeIndir* indir   = op1->AsIndir();
        GenTree*      memBase = indir->Base();

        if (memBase != nullptr)
        {
            sourceReg1 = memBase->gtRegNum;
        }

        if (indir->HasIndex())
        {
            sourceReg2 = indir->Index()->gtRegNum;
        }
    }

    regNumber targetReg = node->gtRegNum;
    if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
    {
        getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
    }
    genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
}

#endif // FEATURE_HW_INTRINSICS
