1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
XX                                                                           XX
XX                  Intel hardware intrinsic Code Generator                  XX
XX                                                                           XX
10 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
11 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
12 | */ |
13 | #include "jitpch.h" |
14 | #ifdef _MSC_VER |
15 | #pragma hdrstop |
16 | #endif |
17 | |
18 | #ifdef FEATURE_HW_INTRINSICS |
19 | |
20 | #include "emit.h" |
21 | #include "codegen.h" |
22 | #include "sideeffects.h" |
23 | #include "lower.h" |
24 | #include "gcinfo.h" |
25 | #include "gcinfoencoder.h" |
26 | |
27 | //------------------------------------------------------------------------ |
28 | // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node |
29 | // |
30 | // Arguments: |
31 | // lowering - The lowering phase from the compiler |
32 | // node - The HWIntrinsic node that has the contained node |
33 | // op - The op that is contained |
34 | // |
35 | static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op) |
36 | { |
37 | #if DEBUG |
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register
    // allocation logic. However, this check is still important due to the various containment rules that
    // SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like
    // LoadAligned and LoadUnaligned) and we use the supportsRegOptional check to support general-purpose
    // loads (both from stack spillage and for isUsedFromMemory contained nodes, in the case where the
    // register allocator decided not to allocate a register in the first place).
47 | |
48 | bool supportsRegOptional = false; |
49 | bool isContainable = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional); |
50 | assert(isContainable || supportsRegOptional); |
51 | #endif // DEBUG |
52 | } |
53 | |
54 | //------------------------------------------------------------------------ |
// genIsTableDrivenHWIntrinsic: Checks whether a hardware intrinsic is fully handled by the table-driven codegen path
//
// Arguments:
//    intrinsicId - the hardware intrinsic ID
//    category    - category of the HW intrinsic
59 | // |
60 | // Return Value: |
61 | // returns true if this category can be table-driven in CodeGen |
62 | // |
63 | static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category) |
64 | { |
    // TODO: add more categories to the table-driven framework
66 | // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen |
67 | const bool tableDrivenCategory = |
68 | (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper); |
69 | const bool tableDrivenFlag = |
70 | !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId); |
71 | return tableDrivenCategory && tableDrivenFlag; |
72 | } |
73 | |
74 | //------------------------------------------------------------------------ |
75 | // genHWIntrinsic: Generates the code for a given hardware intrinsic node. |
76 | // |
77 | // Arguments: |
78 | // node - The hardware intrinsic node |
79 | // |
80 | void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) |
81 | { |
82 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
83 | InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId); |
84 | HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); |
85 | int ival = HWIntrinsicInfo::lookupIval(intrinsicId); |
86 | int numArgs = HWIntrinsicInfo::lookupNumArgs(node); |
87 | |
88 | assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); |
89 | |
90 | if (genIsTableDrivenHWIntrinsic(intrinsicId, category)) |
91 | { |
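        // Table-driven path: the instruction to emit is looked up from the intrinsic table using the
        // SIMD base type, and the remaining work is just dispatching on the operand count plus a few
        // category/flag-specific cases below.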
92 | GenTree* op1 = node->gtGetOp1(); |
93 | GenTree* op2 = node->gtGetOp2(); |
94 | regNumber targetReg = node->gtRegNum; |
95 | var_types targetType = node->TypeGet(); |
96 | var_types baseType = node->gtSIMDBaseType; |
97 | |
98 | regNumber op1Reg = REG_NA; |
99 | regNumber op2Reg = REG_NA; |
100 | emitter* emit = getEmitter(); |
101 | |
102 | assert(numArgs >= 0); |
103 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
104 | assert(ins != INS_invalid); |
105 | emitAttr simdSize = EA_ATTR(node->gtSIMDSize); |
106 | assert(simdSize != 0); |
107 | |
108 | switch (numArgs) |
109 | { |
110 | case 1: |
111 | { |
112 | genConsumeOperands(node); |
113 | op1Reg = op1->gtRegNum; |
114 | |
115 | if (node->OperIsMemoryLoad()) |
116 | { |
117 | emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0); |
118 | } |
119 | else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) |
120 | { |
121 | emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); |
122 | } |
123 | else if ((ival != -1) && varTypeIsFloating(baseType)) |
124 | { |
125 | assert((ival >= 0) && (ival <= 127)); |
126 | genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival); |
127 | } |
128 | else |
129 | { |
130 | genHWIntrinsic_R_RM(node, ins, simdSize); |
131 | } |
132 | break; |
133 | } |
134 | |
135 | case 2: |
136 | { |
137 | genConsumeOperands(node); |
138 | |
139 | op1Reg = op1->gtRegNum; |
140 | op2Reg = op2->gtRegNum; |
141 | |
142 | if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler)) |
143 | { |
144 | // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic. |
145 | // |
146 | // For non-commutative intrinsics, we should have ensured that op2 was marked |
147 | // delay free in order to prevent it from getting assigned the same register |
148 | // as target. However, for commutative intrinsics, we can just swap the operands |
149 | // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. |
150 | |
151 | noway_assert(node->OperIsCommutative()); |
152 | op2Reg = op1Reg; |
153 | op1Reg = targetReg; |
154 | } |
155 | |
156 | if (category == HW_Category_MemoryStore) |
157 | { |
158 | emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0); |
159 | } |
160 | else if ((ival != -1) && varTypeIsFloating(baseType)) |
161 | { |
162 | assert((ival >= 0) && (ival <= 127)); |
163 | genHWIntrinsic_R_R_RM_I(node, ins, ival); |
164 | } |
165 | else if (category == HW_Category_MemoryLoad) |
166 | { |
167 | if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad) |
168 | { |
169 | emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg); |
170 | } |
171 | else |
172 | { |
173 | emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg); |
174 | } |
175 | } |
176 | else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) |
177 | { |
178 | assert(ival == -1); |
179 | |
180 | if (intrinsicId == NI_SSE2_Extract) |
181 | { |
                        // Extract instructions write their result to a general-purpose register,
                        // so the emit size needs to be that of an int rather than the SIMD size.
183 | simdSize = emitTypeSize(TYP_INT); |
184 | } |
185 | |
186 | auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); }; |
187 | |
188 | if (op2->IsCnsIntOrI()) |
189 | { |
190 | ssize_t ival = op2->AsIntCon()->IconValue(); |
191 | assert((ival >= 0) && (ival <= 255)); |
192 | emitSwCase((int8_t)ival); |
193 | } |
194 | else |
195 | { |
196 | // We emit a fallback case for the scenario when the imm-op is not a constant. This should |
197 | // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it |
198 | // can also occur if the consumer calls it directly and just doesn't pass a constant value. |
199 | regNumber baseReg = node->ExtractTempReg(); |
200 | regNumber offsReg = node->GetSingleTempReg(); |
201 | genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase); |
202 | } |
203 | } |
204 | else |
205 | { |
206 | genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize)); |
207 | } |
208 | break; |
209 | } |
210 | |
211 | case 3: |
212 | { |
213 | assert(op1->OperIsList()); |
214 | assert(op1->gtGetOp2()->OperIsList()); |
215 | assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); |
216 | |
217 | GenTreeArgList* argList = op1->AsArgList(); |
218 | op1 = argList->Current(); |
219 | genConsumeRegs(op1); |
220 | op1Reg = op1->gtRegNum; |
221 | |
222 | argList = argList->Rest(); |
223 | op2 = argList->Current(); |
224 | genConsumeRegs(op2); |
225 | op2Reg = op2->gtRegNum; |
226 | |
227 | argList = argList->Rest(); |
228 | GenTree* op3 = argList->Current(); |
229 | genConsumeRegs(op3); |
230 | regNumber op3Reg = op3->gtRegNum; |
231 | |
232 | if (HWIntrinsicInfo::isImmOp(intrinsicId, op3)) |
233 | { |
234 | assert(ival == -1); |
235 | |
236 | auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); }; |
237 | |
238 | if (op3->IsCnsIntOrI()) |
239 | { |
240 | ssize_t ival = op3->AsIntCon()->IconValue(); |
241 | assert((ival >= 0) && (ival <= 255)); |
242 | emitSwCase((int8_t)ival); |
243 | } |
244 | else |
245 | { |
246 | // We emit a fallback case for the scenario when the imm-op is not a constant. This should |
247 | // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it |
248 | // can also occur if the consumer calls it directly and just doesn't pass a constant value. |
249 | regNumber baseReg = node->ExtractTempReg(); |
250 | regNumber offsReg = node->GetSingleTempReg(); |
251 | genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase); |
252 | } |
253 | } |
254 | else if (category == HW_Category_MemoryStore) |
255 | { |
256 | if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore) |
257 | { |
258 | emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0); |
259 | } |
260 | else |
261 | { |
262 | assert(intrinsicId == NI_SSE2_MaskMove); |
263 | assert(targetReg == REG_NA); |
264 | |
265 | // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI |
266 | if (op3Reg != REG_EDI) |
267 | { |
268 | emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg); |
269 | } |
270 | emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg); |
271 | } |
272 | } |
273 | else |
274 | { |
275 | switch (intrinsicId) |
276 | { |
277 | case NI_SSE41_BlendVariable: |
278 | case NI_AVX_BlendVariable: |
279 | case NI_AVX2_BlendVariable: |
280 | { |
281 | genHWIntrinsic_R_R_RM_R(node, ins); |
282 | break; |
283 | } |
284 | |
285 | default: |
286 | { |
287 | unreached(); |
288 | break; |
289 | }; |
290 | } |
291 | } |
292 | break; |
293 | } |
294 | |
295 | default: |
296 | unreached(); |
297 | break; |
298 | } |
299 | genProduceReg(node); |
300 | return; |
301 | } |
302 | |
303 | switch (isa) |
304 | { |
305 | case InstructionSet_Base: |
306 | genBaseIntrinsic(node); |
307 | break; |
308 | case InstructionSet_SSE: |
309 | case InstructionSet_SSE_X64: |
310 | genSSEIntrinsic(node); |
311 | break; |
312 | case InstructionSet_SSE2: |
313 | case InstructionSet_SSE2_X64: |
314 | genSSE2Intrinsic(node); |
315 | break; |
316 | case InstructionSet_SSE41: |
317 | case InstructionSet_SSE41_X64: |
318 | genSSE41Intrinsic(node); |
319 | break; |
320 | case InstructionSet_SSE42: |
321 | case InstructionSet_SSE42_X64: |
322 | genSSE42Intrinsic(node); |
323 | break; |
324 | case InstructionSet_AVX: |
325 | case InstructionSet_AVX2: |
326 | genAvxOrAvx2Intrinsic(node); |
327 | break; |
328 | case InstructionSet_AES: |
329 | genAESIntrinsic(node); |
330 | break; |
331 | case InstructionSet_BMI1: |
332 | case InstructionSet_BMI1_X64: |
333 | case InstructionSet_BMI2: |
334 | case InstructionSet_BMI2_X64: |
335 | genBMI1OrBMI2Intrinsic(node); |
336 | break; |
337 | case InstructionSet_FMA: |
338 | genFMAIntrinsic(node); |
339 | break; |
340 | case InstructionSet_LZCNT: |
341 | case InstructionSet_LZCNT_X64: |
342 | genLZCNTIntrinsic(node); |
343 | break; |
344 | case InstructionSet_PCLMULQDQ: |
345 | genPCLMULQDQIntrinsic(node); |
346 | break; |
347 | case InstructionSet_POPCNT: |
348 | case InstructionSet_POPCNT_X64: |
349 | genPOPCNTIntrinsic(node); |
350 | break; |
351 | default: |
352 | unreached(); |
353 | break; |
354 | } |
355 | } |
356 | |
357 | //------------------------------------------------------------------------ |
358 | // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a |
359 | // register/memory operand and that returns a value in register |
360 | // |
361 | // Arguments: |
362 | // node - The hardware intrinsic node |
363 | // ins - The instruction being generated |
//    attr - The emit attribute for the instruction being generated
365 | // |
366 | void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr) |
367 | { |
368 | var_types targetType = node->TypeGet(); |
369 | regNumber targetReg = node->gtRegNum; |
370 | GenTree* op1 = node->gtGetOp1(); |
371 | GenTree* op2 = node->gtGetOp2(); |
372 | emitter* emit = getEmitter(); |
373 | |
374 | if (op2 != nullptr) |
375 | { |
376 | // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this |
377 | // code path. They are all MultiIns, as the return value comes from the flags and |
378 | // we have two operands instead. |
379 | |
380 | assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId)); |
381 | assert(targetReg != REG_NA); |
382 | |
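        // The compare instruction itself does not write the node's target register (the caller
        // materializes the result from the flags), so re-point "targetReg" at op1's register and
        // treat op2 as the reg/mem source operand for the rest of this method.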
383 | targetReg = op1->gtRegNum; |
384 | op1 = op2; |
385 | op2 = nullptr; |
386 | } |
387 | else |
388 | { |
389 | assert(!node->OperIsCommutative()); |
390 | } |
391 | |
392 | assert(targetReg != REG_NA); |
393 | assert(op2 == nullptr); |
394 | |
395 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
396 | { |
397 | assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); |
398 | assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1); |
399 | |
400 | TempDsc* tmpDsc = nullptr; |
401 | unsigned varNum = BAD_VAR_NUM; |
402 | unsigned offset = (unsigned)-1; |
403 | |
404 | if (op1->isUsedFromSpillTemp()) |
405 | { |
406 | assert(op1->IsRegOptional()); |
407 | |
408 | tmpDsc = getSpillTempDsc(op1); |
409 | varNum = tmpDsc->tdTempNum(); |
410 | offset = 0; |
411 | |
412 | regSet.tmpRlsTemp(tmpDsc); |
413 | } |
414 | else if (op1->OperIsHWIntrinsic()) |
415 | { |
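            // op1 is a contained memory-load intrinsic (e.g. LoadAligned/LoadUnaligned); its own
            // operand holds the address, so load directly from that register.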
416 | emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0); |
417 | return; |
418 | } |
419 | else if (op1->isIndir()) |
420 | { |
421 | GenTreeIndir* memIndir = op1->AsIndir(); |
422 | GenTree* memBase = memIndir->gtOp1; |
423 | |
424 | switch (memBase->OperGet()) |
425 | { |
426 | case GT_LCL_VAR_ADDR: |
427 | { |
428 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
429 | offset = 0; |
430 | |
431 | // Ensure that all the GenTreeIndir values are set to their defaults. |
432 | assert(!memIndir->HasIndex()); |
433 | assert(memIndir->Scale() == 1); |
434 | assert(memIndir->Offset() == 0); |
435 | |
436 | break; |
437 | } |
438 | |
439 | case GT_CLS_VAR_ADDR: |
440 | { |
441 | emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0); |
442 | return; |
443 | } |
444 | |
445 | default: |
446 | { |
447 | emit->emitIns_R_A(ins, attr, targetReg, memIndir); |
448 | return; |
449 | } |
450 | } |
451 | } |
452 | else |
453 | { |
454 | switch (op1->OperGet()) |
455 | { |
456 | case GT_LCL_FLD: |
457 | { |
458 | GenTreeLclFld* lclField = op1->AsLclFld(); |
459 | |
460 | varNum = lclField->GetLclNum(); |
461 | offset = lclField->gtLclFld.gtLclOffs; |
462 | break; |
463 | } |
464 | |
465 | case GT_LCL_VAR: |
466 | { |
467 | assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate()); |
468 | varNum = op1->AsLclVar()->GetLclNum(); |
469 | offset = 0; |
470 | break; |
471 | } |
472 | |
473 | default: |
474 | { |
475 | unreached(); |
476 | break; |
477 | } |
478 | } |
479 | } |
480 | |
481 | // Ensure we got a good varNum and offset. |
482 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
483 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
484 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
485 | assert(offset != (unsigned)-1); |
486 | |
487 | emit->emitIns_R_S(ins, attr, targetReg, varNum, offset); |
488 | } |
489 | else |
490 | { |
491 | regNumber op1Reg = op1->gtRegNum; |
492 | emit->emitIns_R_R(ins, attr, targetReg, op1Reg); |
493 | } |
494 | } |
495 | |
496 | //------------------------------------------------------------------------ |
497 | // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand, |
498 | // an immediate operand, and that returns a value in register |
499 | // |
500 | // Arguments: |
501 | // node - The hardware intrinsic node |
502 | // ins - The instruction being generated |
503 | // ival - The immediate value |
504 | // |
505 | void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival) |
506 | { |
507 | var_types targetType = node->TypeGet(); |
508 | regNumber targetReg = node->gtRegNum; |
509 | GenTree* op1 = node->gtGetOp1(); |
510 | emitAttr simdSize = EA_ATTR(node->gtSIMDSize); |
511 | emitter* emit = getEmitter(); |
512 | |
513 | // TODO-XArch-CQ: Commutative operations can have op1 be contained |
514 | // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained |
515 | |
516 | assert(targetReg != REG_NA); |
517 | assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative |
518 | |
519 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
520 | { |
521 | assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); |
522 | assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1); |
523 | |
524 | TempDsc* tmpDsc = nullptr; |
525 | unsigned varNum = BAD_VAR_NUM; |
526 | unsigned offset = (unsigned)-1; |
527 | |
528 | if (op1->isUsedFromSpillTemp()) |
529 | { |
530 | assert(op1->IsRegOptional()); |
531 | |
532 | tmpDsc = getSpillTempDsc(op1); |
533 | varNum = tmpDsc->tdTempNum(); |
534 | offset = 0; |
535 | |
536 | regSet.tmpRlsTemp(tmpDsc); |
537 | } |
538 | else if (op1->OperIsHWIntrinsic()) |
539 | { |
540 | emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival); |
541 | return; |
542 | } |
543 | else if (op1->isIndir()) |
544 | { |
545 | GenTreeIndir* memIndir = op1->AsIndir(); |
546 | GenTree* memBase = memIndir->gtOp1; |
547 | |
548 | switch (memBase->OperGet()) |
549 | { |
550 | case GT_LCL_VAR_ADDR: |
551 | { |
552 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
553 | offset = 0; |
554 | |
555 | // Ensure that all the GenTreeIndir values are set to their defaults. |
556 | assert(!memIndir->HasIndex()); |
557 | assert(memIndir->Scale() == 1); |
558 | assert(memIndir->Offset() == 0); |
559 | |
560 | break; |
561 | } |
562 | |
563 | case GT_CLS_VAR_ADDR: |
564 | { |
565 | emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival); |
566 | return; |
567 | } |
568 | |
569 | default: |
570 | { |
571 | emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival); |
572 | return; |
573 | } |
574 | } |
575 | } |
576 | else |
577 | { |
578 | switch (op1->OperGet()) |
579 | { |
580 | case GT_LCL_FLD: |
581 | { |
582 | GenTreeLclFld* lclField = op1->AsLclFld(); |
583 | |
584 | varNum = lclField->GetLclNum(); |
585 | offset = lclField->gtLclFld.gtLclOffs; |
586 | break; |
587 | } |
588 | |
589 | case GT_LCL_VAR: |
590 | { |
591 | assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate()); |
592 | varNum = op1->AsLclVar()->GetLclNum(); |
593 | offset = 0; |
594 | break; |
595 | } |
596 | |
597 | default: |
598 | unreached(); |
599 | break; |
600 | } |
601 | } |
602 | |
603 | // Ensure we got a good varNum and offset. |
604 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
605 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
606 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
607 | assert(offset != (unsigned)-1); |
608 | |
609 | emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival); |
610 | } |
611 | else |
612 | { |
613 | regNumber op1Reg = op1->gtRegNum; |
614 | emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival); |
615 | } |
616 | } |
617 | |
618 | //------------------------------------------------------------------------ |
619 | // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a |
620 | // register/memory operand, and that returns a value in register |
621 | // |
622 | // Arguments: |
623 | // node - The hardware intrinsic node |
624 | // ins - The instruction being generated |
//    attr - The emit attribute for the instruction being generated
626 | // |
627 | void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr) |
628 | { |
629 | regNumber targetReg = node->gtRegNum; |
630 | GenTree* op1 = node->gtGetOp1(); |
631 | GenTree* op2 = node->gtGetOp2(); |
632 | regNumber op1Reg = op1->gtRegNum; |
633 | |
634 | assert(targetReg != REG_NA); |
635 | assert(op1Reg != REG_NA); |
636 | |
637 | genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2); |
638 | } |
639 | |
640 | //------------------------------------------------------------------------ |
641 | // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a |
642 | // register/memory operand, and that returns a value in register |
643 | // |
644 | // Arguments: |
645 | // node - The hardware intrinsic node |
646 | // ins - The instruction being generated |
//    attr - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg - The register allocated to the first operand
//    op2 - Another operand that may be in a register or in memory
651 | // |
652 | void CodeGen::genHWIntrinsic_R_R_RM( |
653 | GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2) |
654 | { |
655 | emitter* emit = getEmitter(); |
656 | |
657 | // TODO-XArch-CQ: Commutative operations can have op1 be contained |
658 | // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained |
659 | |
660 | assert(targetReg != REG_NA); |
661 | assert(op1Reg != REG_NA); |
662 | |
663 | if (op2->isContained() || op2->isUsedFromSpillTemp()) |
664 | { |
665 | assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); |
666 | assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); |
667 | |
668 | TempDsc* tmpDsc = nullptr; |
669 | unsigned varNum = BAD_VAR_NUM; |
670 | unsigned offset = (unsigned)-1; |
671 | |
672 | if (op2->isUsedFromSpillTemp()) |
673 | { |
674 | assert(op2->IsRegOptional()); |
675 | |
676 | tmpDsc = getSpillTempDsc(op2); |
677 | varNum = tmpDsc->tdTempNum(); |
678 | offset = 0; |
679 | |
680 | regSet.tmpRlsTemp(tmpDsc); |
681 | } |
682 | else if (op2->OperIsHWIntrinsic()) |
683 | { |
684 | emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum); |
685 | return; |
686 | } |
687 | else if (op2->isIndir()) |
688 | { |
689 | GenTreeIndir* memIndir = op2->AsIndir(); |
690 | GenTree* memBase = memIndir->gtOp1; |
691 | |
692 | switch (memBase->OperGet()) |
693 | { |
694 | case GT_LCL_VAR_ADDR: |
695 | { |
696 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
697 | offset = 0; |
698 | |
699 | // Ensure that all the GenTreeIndir values are set to their defaults. |
700 | assert(!memIndir->HasIndex()); |
701 | assert(memIndir->Scale() == 1); |
702 | assert(memIndir->Offset() == 0); |
703 | |
704 | break; |
705 | } |
706 | |
707 | case GT_CLS_VAR_ADDR: |
708 | { |
709 | emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0); |
710 | return; |
711 | } |
712 | |
713 | default: |
714 | { |
715 | emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir); |
716 | return; |
717 | } |
718 | } |
719 | } |
720 | else |
721 | { |
722 | switch (op2->OperGet()) |
723 | { |
724 | case GT_LCL_FLD: |
725 | { |
726 | GenTreeLclFld* lclField = op2->AsLclFld(); |
727 | |
728 | varNum = lclField->GetLclNum(); |
729 | offset = lclField->gtLclFld.gtLclOffs; |
730 | break; |
731 | } |
732 | |
733 | case GT_LCL_VAR: |
734 | { |
735 | assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate()); |
736 | varNum = op2->AsLclVar()->GetLclNum(); |
737 | offset = 0; |
738 | break; |
739 | } |
740 | |
741 | default: |
742 | unreached(); |
743 | break; |
744 | } |
745 | } |
746 | |
747 | // Ensure we got a good varNum and offset. |
748 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
749 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
750 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
751 | assert(offset != (unsigned)-1); |
752 | |
753 | emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset); |
754 | } |
755 | else |
756 | { |
757 | regNumber op2Reg = op2->gtRegNum; |
758 | |
759 | if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler)) |
760 | { |
761 | // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic. |
762 | // |
763 | // For non-commutative intrinsics, we should have ensured that op2 was marked |
764 | // delay free in order to prevent it from getting assigned the same register |
765 | // as target. However, for commutative intrinsics, we can just swap the operands |
766 | // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. |
767 | |
768 | noway_assert(node->OperIsCommutative()); |
769 | op2Reg = op1Reg; |
770 | op1Reg = targetReg; |
771 | } |
772 | |
773 | emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg); |
774 | } |
775 | } |
776 | |
777 | //------------------------------------------------------------------------ |
778 | // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a |
779 | // register/memory operand, an immediate operand, and that returns a value in register |
780 | // |
781 | // Arguments: |
782 | // node - The hardware intrinsic node |
783 | // ins - The instruction being generated |
784 | // ival - The immediate value |
785 | // |
786 | void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival) |
787 | { |
788 | var_types targetType = node->TypeGet(); |
789 | regNumber targetReg = node->gtRegNum; |
790 | GenTree* op1 = node->gtGetOp1(); |
791 | GenTree* op2 = node->gtGetOp2(); |
792 | emitAttr simdSize = EA_ATTR(node->gtSIMDSize); |
793 | emitter* emit = getEmitter(); |
794 | |
795 | // TODO-XArch-CQ: Commutative operations can have op1 be contained |
796 | // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained |
797 | |
798 | if (op1->OperIsList()) |
799 | { |
800 | assert(op2 == nullptr); |
801 | |
802 | GenTreeArgList* argList = op1->AsArgList(); |
803 | |
804 | op1 = argList->Current(); |
805 | argList = argList->Rest(); |
806 | |
807 | op2 = argList->Current(); |
808 | argList = argList->Rest(); |
809 | |
810 | assert(argList->Current() != nullptr); |
811 | assert(argList->Rest() == nullptr); |
812 | } |
813 | |
814 | regNumber op1Reg = op1->gtRegNum; |
815 | |
816 | assert(targetReg != REG_NA); |
817 | assert(op1Reg != REG_NA); |
818 | |
819 | if (op2->isContained() || op2->isUsedFromSpillTemp()) |
820 | { |
821 | assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); |
822 | assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); |
823 | |
824 | TempDsc* tmpDsc = nullptr; |
825 | unsigned varNum = BAD_VAR_NUM; |
826 | unsigned offset = (unsigned)-1; |
827 | |
828 | if (op2->isUsedFromSpillTemp()) |
829 | { |
830 | assert(op2->IsRegOptional()); |
831 | |
832 | tmpDsc = getSpillTempDsc(op2); |
833 | varNum = tmpDsc->tdTempNum(); |
834 | offset = 0; |
835 | |
836 | regSet.tmpRlsTemp(tmpDsc); |
837 | } |
838 | else if (op2->OperIsHWIntrinsic()) |
839 | { |
840 | emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival); |
841 | return; |
842 | } |
843 | else if (op2->isIndir()) |
844 | { |
845 | GenTreeIndir* memIndir = op2->AsIndir(); |
846 | GenTree* memBase = memIndir->gtOp1; |
847 | |
848 | switch (memBase->OperGet()) |
849 | { |
850 | case GT_LCL_VAR_ADDR: |
851 | { |
852 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
853 | offset = 0; |
854 | |
855 | // Ensure that all the GenTreeIndir values are set to their defaults. |
856 | assert(!memIndir->HasIndex()); |
857 | assert(memIndir->Scale() == 1); |
858 | assert(memIndir->Offset() == 0); |
859 | |
860 | break; |
861 | } |
862 | |
863 | case GT_CLS_VAR_ADDR: |
864 | { |
865 | emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, |
866 | ival); |
867 | return; |
868 | } |
869 | |
870 | default: |
871 | { |
872 | emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival); |
873 | return; |
874 | } |
875 | } |
876 | } |
877 | else |
878 | { |
879 | switch (op2->OperGet()) |
880 | { |
881 | case GT_LCL_FLD: |
882 | { |
883 | GenTreeLclFld* lclField = op2->AsLclFld(); |
884 | |
885 | varNum = lclField->GetLclNum(); |
886 | offset = lclField->gtLclFld.gtLclOffs; |
887 | break; |
888 | } |
889 | |
890 | case GT_LCL_VAR: |
891 | { |
892 | assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate()); |
893 | varNum = op2->AsLclVar()->GetLclNum(); |
894 | offset = 0; |
895 | break; |
896 | } |
897 | |
898 | default: |
899 | unreached(); |
900 | break; |
901 | } |
902 | } |
903 | |
904 | // Ensure we got a good varNum and offset. |
905 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
906 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
907 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
908 | assert(offset != (unsigned)-1); |
909 | |
910 | emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival); |
911 | } |
912 | else |
913 | { |
914 | regNumber op2Reg = op2->gtRegNum; |
915 | |
916 | if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler)) |
917 | { |
918 | // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic. |
919 | // |
920 | // For non-commutative intrinsics, we should have ensured that op2 was marked |
921 | // delay free in order to prevent it from getting assigned the same register |
922 | // as target. However, for commutative intrinsics, we can just swap the operands |
923 | // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. |
924 | |
925 | noway_assert(node->OperIsCommutative()); |
926 | op2Reg = op1Reg; |
927 | op1Reg = targetReg; |
928 | } |
929 | |
930 | emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival); |
931 | } |
932 | } |
933 | |
934 | //------------------------------------------------------------------------ |
935 | // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a |
936 | // register/memory operand, another register operand, and that returns a value in register |
937 | // |
938 | // Arguments: |
939 | // node - The hardware intrinsic node |
940 | // ins - The instruction being generated |
941 | // |
942 | void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins) |
943 | { |
944 | var_types targetType = node->TypeGet(); |
945 | regNumber targetReg = node->gtRegNum; |
946 | GenTree* op1 = node->gtGetOp1(); |
947 | GenTree* op2 = node->gtGetOp2(); |
948 | GenTree* op3 = nullptr; |
949 | emitAttr simdSize = EA_ATTR(node->gtSIMDSize); |
950 | emitter* emit = getEmitter(); |
951 | |
952 | assert(op1->OperIsList()); |
953 | assert(op2 == nullptr); |
954 | |
955 | GenTreeArgList* argList = op1->AsArgList(); |
956 | |
957 | op1 = argList->Current(); |
958 | argList = argList->Rest(); |
959 | |
960 | op2 = argList->Current(); |
961 | argList = argList->Rest(); |
962 | |
963 | op3 = argList->Current(); |
964 | assert(argList->Rest() == nullptr); |
965 | |
966 | regNumber op1Reg = op1->gtRegNum; |
967 | regNumber op3Reg = op3->gtRegNum; |
968 | |
969 | assert(targetReg != REG_NA); |
970 | assert(op1Reg != REG_NA); |
971 | assert(op3Reg != REG_NA); |
972 | |
973 | if (op2->isContained() || op2->isUsedFromSpillTemp()) |
974 | { |
975 | assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); |
976 | assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); |
977 | |
978 | TempDsc* tmpDsc = nullptr; |
979 | unsigned varNum = BAD_VAR_NUM; |
980 | unsigned offset = (unsigned)-1; |
981 | |
982 | if (op2->isUsedFromSpillTemp()) |
983 | { |
984 | assert(op2->IsRegOptional()); |
985 | |
986 | // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common |
987 | // pattern. It could probably be extracted to its own method. |
988 | tmpDsc = getSpillTempDsc(op2); |
989 | varNum = tmpDsc->tdTempNum(); |
990 | offset = 0; |
991 | |
992 | regSet.tmpRlsTemp(tmpDsc); |
993 | } |
994 | else if (op2->OperIsHWIntrinsic()) |
995 | { |
996 | emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum); |
997 | return; |
998 | } |
999 | else if (op2->isIndir()) |
1000 | { |
1001 | GenTreeIndir* memIndir = op2->AsIndir(); |
1002 | GenTree* memBase = memIndir->gtOp1; |
1003 | |
1004 | switch (memBase->OperGet()) |
1005 | { |
1006 | case GT_LCL_VAR_ADDR: |
1007 | { |
1008 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
1009 | offset = 0; |
1010 | |
1011 | // Ensure that all the GenTreeIndir values are set to their defaults. |
1012 | assert(!memIndir->HasIndex()); |
1013 | assert(memIndir->Scale() == 1); |
1014 | assert(memIndir->Offset() == 0); |
1015 | |
1016 | break; |
1017 | } |
1018 | |
1019 | case GT_CLS_VAR_ADDR: |
1020 | { |
1021 | emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd, |
1022 | 0); |
1023 | return; |
1024 | } |
1025 | |
1026 | default: |
1027 | { |
1028 | emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir); |
1029 | return; |
1030 | } |
1031 | } |
1032 | } |
1033 | else |
1034 | { |
1035 | switch (op2->OperGet()) |
1036 | { |
1037 | case GT_LCL_FLD: |
1038 | { |
1039 | GenTreeLclFld* lclField = op2->AsLclFld(); |
1040 | |
1041 | varNum = lclField->GetLclNum(); |
1042 | offset = lclField->gtLclFld.gtLclOffs; |
1043 | break; |
1044 | } |
1045 | |
1046 | case GT_LCL_VAR: |
1047 | { |
1048 | assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate()); |
1049 | varNum = op2->AsLclVar()->GetLclNum(); |
1050 | offset = 0; |
1051 | break; |
1052 | } |
1053 | |
1054 | default: |
1055 | unreached(); |
1056 | break; |
1057 | } |
1058 | } |
1059 | |
1060 | // Ensure we got a good varNum and offset. |
1061 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
1062 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
1063 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
1064 | assert(offset != (unsigned)-1); |
1065 | |
1066 | emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset); |
1067 | } |
1068 | else |
1069 | { |
1070 | emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg); |
1071 | } |
1072 | } |
1073 | |
1074 | //------------------------------------------------------------------------ |
1075 | // genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands, |
1076 | // a register/memory operand, and that returns a value in register |
1077 | // |
1078 | // Arguments: |
1079 | // ins - The instruction being generated |
1080 | // attr - The emit attribute |
1081 | // targetReg - The target register |
1082 | // op1Reg - The register of the first operand |
1083 | // op2Reg - The register of the second operand |
1084 | // op3 - The third operand |
1085 | // |
1086 | void CodeGen::genHWIntrinsic_R_R_R_RM( |
1087 | instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3) |
1088 | { |
1089 | assert(targetReg != REG_NA); |
1090 | assert(op1Reg != REG_NA); |
1091 | assert(op2Reg != REG_NA); |
1092 | |
1093 | emitter* emit = getEmitter(); |
1094 | |
1095 | if (op3->isContained() || op3->isUsedFromSpillTemp()) |
1096 | { |
1097 | TempDsc* tmpDsc = nullptr; |
1098 | unsigned varNum = BAD_VAR_NUM; |
1099 | unsigned offset = (unsigned)-1; |
1100 | |
1101 | if (op3->isUsedFromSpillTemp()) |
1102 | { |
1103 | assert(op3->IsRegOptional()); |
1104 | |
1105 | // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common |
1106 | // pattern. It could probably be extracted to its own method. |
1107 | tmpDsc = getSpillTempDsc(op3); |
1108 | varNum = tmpDsc->tdTempNum(); |
1109 | offset = 0; |
1110 | |
1111 | regSet.tmpRlsTemp(tmpDsc); |
1112 | } |
1113 | else if (op3->OperIsHWIntrinsic()) |
1114 | { |
1115 | emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum); |
1116 | return; |
1117 | } |
1118 | else if (op3->isIndir()) |
1119 | { |
1120 | GenTreeIndir* memIndir = op3->AsIndir(); |
1121 | GenTree* memBase = memIndir->gtOp1; |
1122 | |
1123 | switch (memBase->OperGet()) |
1124 | { |
1125 | case GT_LCL_VAR_ADDR: |
1126 | { |
1127 | varNum = memBase->AsLclVarCommon()->GetLclNum(); |
1128 | offset = 0; |
1129 | |
1130 | // Ensure that all the GenTreeIndir values are set to their defaults. |
1131 | assert(!memIndir->HasIndex()); |
1132 | assert(memIndir->Scale() == 1); |
1133 | assert(memIndir->Offset() == 0); |
1134 | |
1135 | break; |
1136 | } |
1137 | |
1138 | case GT_CLS_VAR_ADDR: |
1139 | { |
1140 | emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0); |
1141 | return; |
1142 | } |
1143 | |
1144 | default: |
1145 | { |
1146 | emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir); |
1147 | return; |
1148 | } |
1149 | } |
1150 | } |
1151 | else |
1152 | { |
1153 | switch (op3->OperGet()) |
1154 | { |
1155 | case GT_LCL_FLD: |
1156 | { |
1157 | GenTreeLclFld* lclField = op3->AsLclFld(); |
1158 | |
1159 | varNum = lclField->GetLclNum(); |
1160 | offset = lclField->gtLclFld.gtLclOffs; |
1161 | break; |
1162 | } |
1163 | |
1164 | case GT_LCL_VAR: |
1165 | { |
1166 | assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate()); |
1167 | varNum = op3->AsLclVar()->GetLclNum(); |
1168 | offset = 0; |
1169 | break; |
1170 | } |
1171 | |
1172 | default: |
1173 | unreached(); |
1174 | break; |
1175 | } |
1176 | } |
1177 | |
1178 | // Ensure we got a good varNum and offset. |
1179 | // We also need to check for `tmpDsc != nullptr` since spill temp numbers |
1180 | // are negative and start with -1, which also happens to be BAD_VAR_NUM. |
1181 | assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); |
1182 | assert(offset != (unsigned)-1); |
1183 | |
1184 | emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset); |
1185 | } |
1186 | else |
1187 | { |
1188 | emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum); |
1189 | } |
1190 | } |
1191 | |
//------------------------------------------------------------------------
// genHWIntrinsicJumpTableFallback: generate the jump-table fallback for imm-intrinsics
//                                  with non-constant argument
1194 | // |
1195 | // Arguments: |
1196 | // intrinsic - intrinsic ID |
//    nonConstImmReg - the register that contains the non-constant imm8 argument
//    baseReg - a register for the start of the switch table
//    offsReg - a register for the offset into the switch table
//    emitSwCase - the lambda that generates a switch case
1201 | // |
1202 | // Return Value: |
//    None.
1204 | // Note: |
//    This function can be used for all imm-intrinsics (whether full-range or not).
//    The compiler front-end (i.e. the importer) is responsible for inserting a range check IR
//    (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to perform one.
1208 | // |
1209 | template <typename HWIntrinsicSwitchCaseBody> |
1210 | void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic, |
1211 | regNumber nonConstImmReg, |
1212 | regNumber baseReg, |
1213 | regNumber offsReg, |
1214 | HWIntrinsicSwitchCaseBody emitSwCase) |
1215 | { |
1216 | assert(nonConstImmReg != REG_NA); |
    // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
    // that does not work with the compiler-generated jump-table fallback below
1219 | assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)); |
1220 | emitter* emit = getEmitter(); |
1221 | |
1222 | const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1; |
1223 | assert(maxByte <= 256); |
1224 | BasicBlock* jmpTable[256]; |
1225 | |
1226 | unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true); |
1227 | unsigned jmpTableOffs = 0; |
1228 | |
1229 | // Emit the jump table |
1230 | for (unsigned i = 0; i < maxByte; i++) |
1231 | { |
1232 | jmpTable[i] = genCreateTempLabel(); |
1233 | emit->emitDataGenData(i, jmpTable[i]); |
1234 | } |
1235 | |
1236 | emit->emitDataGenEnd(); |
1237 | |
1238 | // Compute and jump to the appropriate offset in the switch table |
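    // The data-section table built above holds, for each possible imm8 value, the code offset of the
    // corresponding switch case. The sequence below loads the entry selected by nonConstImmReg, adds
    // the method base address, and jumps to it; roughly:
    //
    //    lea   offsReg, [jumpTableData]
    //    mov   offsReg, dword ptr [offsReg + nonConstImmReg * 4]
    //    lea   baseReg, [methodBase]
    //    add   offsReg, baseReg
    //    jmp   offsReg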
1239 | emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0); |
1240 | |
1241 | emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0); |
1242 | emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg); |
1243 | emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg); |
1244 | emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg); |
1245 | |
1246 | // Emit the switch table entries |
1247 | |
1248 | BasicBlock* switchTableBeg = genCreateTempLabel(); |
1249 | BasicBlock* switchTableEnd = genCreateTempLabel(); |
1250 | |
1251 | genDefineTempLabel(switchTableBeg); |
1252 | |
1253 | for (unsigned i = 0; i < maxByte; i++) |
1254 | { |
1255 | genDefineTempLabel(jmpTable[i]); |
1256 | emitSwCase((int8_t)i); |
1257 | emit->emitIns_J(INS_jmp, switchTableEnd); |
1258 | } |
1259 | |
1260 | genDefineTempLabel(switchTableEnd); |
1261 | } |
1262 | |
1263 | //------------------------------------------------------------------------ |
1264 | // genBaseIntrinsic: Generates the code for a base hardware intrinsic node |
1265 | // |
1266 | // Arguments: |
1267 | // node - The hardware intrinsic node |
1268 | // |
1269 | // Note: |
1270 | // We currently assume that all base intrinsics only have a single operand. |
1271 | // |
1272 | void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) |
1273 | { |
1274 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1275 | regNumber targetReg = node->gtRegNum; |
1276 | var_types targetType = node->TypeGet(); |
1277 | var_types baseType = node->gtSIMDBaseType; |
1278 | |
1279 | assert(compiler->compSupports(InstructionSet_SSE)); |
1280 | assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE)); |
1281 | |
1282 | GenTree* op1 = node->gtGetOp1(); |
1283 | regNumber op1Reg = REG_NA; |
1284 | |
1285 | if (op1 != nullptr) |
1286 | { |
1287 | assert(!op1->OperIsList()); |
1288 | op1Reg = op1->gtRegNum; |
1289 | genConsumeOperands(node); |
1290 | } |
1291 | |
1292 | assert(node->gtGetOp2() == nullptr); |
1293 | |
1294 | emitter* emit = getEmitter(); |
1295 | emitAttr attr = EA_ATTR(node->gtSIMDSize); |
1296 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1297 | |
1298 | switch (intrinsicId) |
1299 | { |
1300 | case NI_Base_Vector128_CreateScalarUnsafe: |
1301 | case NI_Base_Vector256_CreateScalarUnsafe: |
1302 | { |
1303 | if (varTypeIsIntegral(baseType)) |
1304 | { |
1305 | genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType)); |
1306 | } |
1307 | else |
1308 | { |
1309 | assert(varTypeIsFloating(baseType)); |
1310 | |
1311 | attr = emitTypeSize(baseType); |
1312 | |
1313 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
1314 | { |
1315 | genHWIntrinsic_R_RM(node, ins, attr); |
1316 | } |
1317 | else if (targetReg != op1Reg) |
1318 | { |
1319 | // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs |
1320 | emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); |
1321 | } |
1322 | } |
1323 | break; |
1324 | } |
1325 | |
1326 | case NI_Base_Vector128_ToScalar: |
1327 | case NI_Base_Vector256_ToScalar: |
1328 | { |
1329 | assert(varTypeIsFloating(baseType)); |
1330 | |
1331 | attr = emitTypeSize(TYP_SIMD16); |
1332 | |
1333 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
1334 | { |
1335 | genHWIntrinsic_R_RM(node, ins, attr); |
1336 | } |
1337 | else if (targetReg != op1Reg) |
1338 | { |
1339 | // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs |
1340 | emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); |
1341 | } |
1342 | break; |
1343 | } |
1344 | |
1345 | case NI_Base_Vector128_ToVector256: |
1346 | { |
1347 | // ToVector256 has zero-extend semantics in order to ensure it is deterministic |
1348 | // We always emit a move to the target register, even when op1Reg == targetReg, |
1349 | // in order to ensure that Bits MAXVL-1:128 are zeroed. |
1350 | |
1351 | attr = emitTypeSize(TYP_SIMD16); |
1352 | |
1353 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
1354 | { |
1355 | genHWIntrinsic_R_RM(node, ins, attr); |
1356 | } |
1357 | else |
1358 | { |
1359 | // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs |
1360 | emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); |
1361 | } |
1362 | break; |
1363 | } |
1364 | |
1365 | case NI_Base_Vector128_ToVector256Unsafe: |
1366 | case NI_Base_Vector256_GetLower: |
1367 | { |
1368 | if (op1->isContained() || op1->isUsedFromSpillTemp()) |
1369 | { |
1370 | genHWIntrinsic_R_RM(node, ins, attr); |
1371 | } |
1372 | else if (targetReg != op1Reg) |
1373 | { |
1374 | // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs |
1375 | emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); |
1376 | } |
1377 | break; |
1378 | } |
1379 | |
1380 | case NI_Base_Vector128_Zero: |
1381 | case NI_Base_Vector256_Zero: |
1382 | { |
1383 | assert(op1 == nullptr); |
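            // The table maps Vector128/256.Zero to an xor-style instruction, so xor'ing the target
            // register with itself produces zero regardless of its previous contents.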
1384 | emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg); |
1385 | break; |
1386 | } |
1387 | |
1388 | default: |
1389 | { |
1390 | unreached(); |
1391 | break; |
1392 | } |
1393 | } |
1394 | |
1395 | genProduceReg(node); |
1396 | } |
1397 | |
1398 | //------------------------------------------------------------------------ |
1399 | // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node |
1400 | // |
1401 | // Arguments: |
1402 | // node - The hardware intrinsic node |
1403 | // |
1404 | void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) |
1405 | { |
1406 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1407 | GenTree* op1 = node->gtGetOp1(); |
1408 | GenTree* op2 = node->gtGetOp2(); |
1409 | GenTree* op3 = nullptr; |
1410 | GenTree* op4 = nullptr; |
1411 | regNumber targetReg = node->gtRegNum; |
1412 | var_types targetType = node->TypeGet(); |
1413 | var_types baseType = node->gtSIMDBaseType; |
1414 | |
1415 | regNumber op1Reg = REG_NA; |
1416 | regNumber op2Reg = REG_NA; |
1417 | regNumber op3Reg = REG_NA; |
1418 | regNumber op4Reg = REG_NA; |
1419 | emitter* emit = getEmitter(); |
1420 | |
1421 | if ((op1 != nullptr) && !op1->OperIsList()) |
1422 | { |
1423 | op1Reg = op1->gtRegNum; |
1424 | genConsumeOperands(node); |
1425 | } |
1426 | |
1427 | switch (intrinsicId) |
1428 | { |
1429 | case NI_SSE_CompareEqualOrderedScalar: |
1430 | case NI_SSE_CompareEqualUnorderedScalar: |
1431 | { |
1432 | assert(baseType == TYP_FLOAT); |
1433 | regNumber tmpReg = node->GetSingleTempReg(); |
1434 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1435 | |
1436 | // Ensure we aren't overwriting targetReg |
1437 | assert(tmpReg != targetReg); |
1438 | |
1439 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
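            // (u)comiss sets ZF/PF/CF from the comparison: an unordered result (NaN) gives ZF=PF=CF=1,
            // while an ordered equal result gives ZF=1 with PF=0. "Equal" therefore means ZF=1 && PF=0,
            // so we AND the sete (ZF=1) and setpo (PF=0) results below.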
1440 | emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg); |
1441 | emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg); |
1442 | emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg); |
1443 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg); |
1444 | break; |
1445 | } |
1446 | |
1447 | case NI_SSE_CompareGreaterThanOrderedScalar: |
1448 | case NI_SSE_CompareGreaterThanUnorderedScalar: |
1449 | { |
1450 | assert(baseType == TYP_FLOAT); |
1451 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1452 | |
1453 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1454 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
1455 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1456 | break; |
1457 | } |
1458 | |
1459 | case NI_SSE_CompareGreaterThanOrEqualOrderedScalar: |
1460 | case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar: |
1461 | { |
1462 | assert(baseType == TYP_FLOAT); |
1463 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1464 | |
1465 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1466 | emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); |
1467 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1468 | break; |
1469 | } |
1470 | |
1471 | case NI_SSE_CompareLessThanOrderedScalar: |
1472 | case NI_SSE_CompareLessThanUnorderedScalar: |
1473 | { |
1474 | assert(baseType == TYP_FLOAT); |
1475 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1476 | |
1477 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1478 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
1479 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1480 | break; |
1481 | } |
1482 | |
1483 | case NI_SSE_CompareLessThanOrEqualOrderedScalar: |
1484 | case NI_SSE_CompareLessThanOrEqualUnorderedScalar: |
1485 | { |
1486 | assert(baseType == TYP_FLOAT); |
1487 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1488 | |
1489 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1490 | emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); |
1491 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1492 | break; |
1493 | } |
1494 | |
1495 | case NI_SSE_CompareNotEqualOrderedScalar: |
1496 | case NI_SSE_CompareNotEqualUnorderedScalar: |
1497 | { |
1498 | assert(baseType == TYP_FLOAT); |
1499 | regNumber tmpReg = node->GetSingleTempReg(); |
1500 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1501 | |
1502 | // Ensure we aren't overwriting targetReg |
1503 | assert(tmpReg != targetReg); |
1504 | |
1505 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
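            // For "not equal", an unordered result (NaN) must also report true: the condition is
            // ZF=0 || PF=1, so we OR the setne (ZF=0) and setpe (PF=1) results below.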
1506 | emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg); |
1507 | emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg); |
1508 | emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg); |
1509 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg); |
1510 | break; |
1511 | } |
1512 | |
1513 | case NI_SSE_X64_ConvertScalarToVector128Single: |
1514 | { |
1515 | assert(baseType == TYP_LONG); |
1516 | assert(op1 != nullptr); |
1517 | assert(op2 != nullptr); |
1518 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1519 | genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE); |
1520 | break; |
1521 | } |
1522 | |
1523 | case NI_SSE_MoveMask: |
1524 | { |
1525 | assert(baseType == TYP_FLOAT); |
1526 | assert(op2 == nullptr); |
1527 | |
1528 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1529 | emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg); |
1530 | break; |
1531 | } |
1532 | |
1533 | case NI_SSE_Prefetch0: |
1534 | case NI_SSE_Prefetch1: |
1535 | case NI_SSE_Prefetch2: |
1536 | case NI_SSE_PrefetchNonTemporal: |
1537 | { |
1538 | assert(baseType == TYP_UBYTE); |
1539 | assert(op2 == nullptr); |
1540 | |
1541 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); |
1542 | emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0); |
1543 | break; |
1544 | } |
1545 | |
1546 | case NI_SSE_StoreFence: |
1547 | { |
1548 | assert(baseType == TYP_VOID); |
1549 | assert(op1 == nullptr); |
1550 | assert(op2 == nullptr); |
1551 | emit->emitIns(INS_sfence); |
1552 | break; |
1553 | } |
1554 | |
1555 | default: |
1556 | unreached(); |
1557 | break; |
1558 | } |
1559 | |
1560 | genProduceReg(node); |
1561 | } |
1562 | |
1563 | //------------------------------------------------------------------------ |
1564 | // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node |
1565 | // |
1566 | // Arguments: |
1567 | // node - The hardware intrinsic node |
1568 | // |
1569 | void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) |
1570 | { |
1571 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1572 | GenTree* op1 = node->gtGetOp1(); |
1573 | GenTree* op2 = node->gtGetOp2(); |
1574 | regNumber targetReg = node->gtRegNum; |
1575 | var_types targetType = node->TypeGet(); |
1576 | var_types baseType = node->gtSIMDBaseType; |
1577 | regNumber op1Reg = REG_NA; |
1578 | regNumber op2Reg = REG_NA; |
1579 | emitter* emit = getEmitter(); |
1580 | |
1581 | if ((op1 != nullptr) && !op1->OperIsList()) |
1582 | { |
1583 | op1Reg = op1->gtRegNum; |
1584 | genConsumeOperands(node); |
1585 | } |
1586 | |
1587 | switch (intrinsicId) |
1588 | { |
        // All integer overloads are handled by table-driven codegen; only the double overload of
        // CompareLessThan needs manual handling here.
1590 | case NI_SSE2_CompareLessThan: |
1591 | { |
1592 | assert(op1 != nullptr); |
1593 | assert(op2 != nullptr); |
1594 | |
1595 | assert(baseType == TYP_DOUBLE); |
1596 | |
1597 | int ival = HWIntrinsicInfo::lookupIval(intrinsicId); |
1598 | assert((ival >= 0) && (ival <= 127)); |
1599 | |
1600 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1601 | op2Reg = op2->gtRegNum; |
1602 | emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival); |
1603 | |
1604 | break; |
1605 | } |
1606 | |
1607 | case NI_SSE2_CompareEqualOrderedScalar: |
1608 | case NI_SSE2_CompareEqualUnorderedScalar: |
1609 | { |
1610 | assert(baseType == TYP_DOUBLE); |
1611 | regNumber tmpReg = node->GetSingleTempReg(); |
1612 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1613 | |
1614 | // Ensure we aren't overwriting targetReg |
1615 | assert(tmpReg != targetReg); |
1616 | |
1617 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1618 | emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg); |
1619 | emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg); |
1620 | emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg); |
1621 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg); |
1622 | break; |
1623 | } |
1624 | |
1625 | case NI_SSE2_CompareGreaterThanOrderedScalar: |
1626 | case NI_SSE2_CompareGreaterThanUnorderedScalar: |
1627 | { |
1628 | assert(baseType == TYP_DOUBLE); |
1629 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1630 | |
1631 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1632 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
1633 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1634 | break; |
1635 | } |
1636 | |
1637 | case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar: |
1638 | case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar: |
1639 | { |
1640 | assert(baseType == TYP_DOUBLE); |
1641 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1642 | |
1643 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1644 | emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); |
1645 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1646 | break; |
1647 | } |
1648 | |
1649 | case NI_SSE2_CompareLessThanOrderedScalar: |
1650 | case NI_SSE2_CompareLessThanUnorderedScalar: |
1651 | { |
1652 | assert(baseType == TYP_DOUBLE); |
1653 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1654 | |
1655 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1656 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
1657 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1658 | break; |
1659 | } |
1660 | |
1661 | case NI_SSE2_CompareLessThanOrEqualOrderedScalar: |
1662 | case NI_SSE2_CompareLessThanOrEqualUnorderedScalar: |
1663 | { |
1664 | assert(baseType == TYP_DOUBLE); |
1665 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1666 | |
1667 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1668 | emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); |
1669 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1670 | break; |
1671 | } |
1672 | |
1673 | case NI_SSE2_CompareNotEqualOrderedScalar: |
1674 | case NI_SSE2_CompareNotEqualUnorderedScalar: |
1675 | { |
1676 | assert(baseType == TYP_DOUBLE); |
1677 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1678 | regNumber tmpReg = node->GetSingleTempReg(); |
1679 | |
1680 | // Ensure we aren't overwriting targetReg |
1681 | assert(tmpReg != targetReg); |
1682 | |
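            // "Not equal" is the complement of the equality check above: the result is true when
            // ZF == 0 (the values differed) or PF == 1 (the compare was unordered, and NaN != x is
            // true), so the sequence below ORs setne with setpe and zero-extends the byte result.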
1683 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16)); |
1684 | emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg); |
1685 | emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg); |
1686 | emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg); |
1687 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg); |
1688 | break; |
1689 | } |
1690 | |
1691 | case NI_SSE2_X64_ConvertScalarToVector128Double: |
1692 | { |
1693 | assert(baseType == TYP_LONG); |
1694 | assert(op1 != nullptr); |
1695 | assert(op2 != nullptr); |
1696 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1697 | genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE); |
1698 | break; |
1699 | } |
1700 | |
1701 | case NI_SSE2_X64_ConvertScalarToVector128Int64: |
1702 | case NI_SSE2_X64_ConvertScalarToVector128UInt64: |
1703 | { |
1704 | assert(baseType == TYP_LONG || baseType == TYP_ULONG); |
1705 | assert(op1 != nullptr); |
1706 | assert(op2 == nullptr); |
1707 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1708 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType)); |
1709 | break; |
1710 | } |
1711 | |
1712 | case NI_SSE2_ConvertToInt32: |
1713 | case NI_SSE2_ConvertToInt32WithTruncation: |
1714 | case NI_SSE2_ConvertToUInt32: |
1715 | case NI_SSE2_X64_ConvertToUInt64: |
1716 | case NI_SSE2_X64_ConvertToInt64: |
1717 | { |
1718 | assert(op2 == nullptr); |
1719 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1720 | |
1721 | if (varTypeIsIntegral(baseType)) |
1722 | { |
1723 | assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG); |
1724 | emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg); |
1725 | } |
1726 | else |
1727 | { |
1728 | assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT); |
1729 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType)); |
1730 | } |
1731 | break; |
1732 | } |
1733 | |
1734 | case NI_SSE2_LoadFence: |
1735 | { |
1736 | assert(baseType == TYP_VOID); |
1737 | assert(op1 == nullptr); |
1738 | assert(op2 == nullptr); |
1739 | emit->emitIns(INS_lfence); |
1740 | break; |
1741 | } |
1742 | |
1743 | case NI_SSE2_MemoryFence: |
1744 | { |
1745 | assert(baseType == TYP_VOID); |
1746 | assert(op1 == nullptr); |
1747 | assert(op2 == nullptr); |
1748 | emit->emitIns(INS_mfence); |
1749 | break; |
1750 | } |
1751 | |
1752 | case NI_SSE2_MoveMask: |
1753 | { |
1754 | assert(op2 == nullptr); |
1755 | assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE); |
1756 | |
1757 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1758 | emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg); |
1759 | break; |
1760 | } |
1761 | |
1762 | case NI_SSE2_StoreNonTemporal: |
1763 | case NI_SSE2_X64_StoreNonTemporal: |
1764 | { |
1765 | assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG); |
1766 | assert(op1 != nullptr); |
1767 | assert(op2 != nullptr); |
1768 | |
1769 | op2Reg = op2->gtRegNum; |
1770 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1771 | emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0); |
1772 | break; |
1773 | } |
1774 | |
1775 | default: |
1776 | unreached(); |
1777 | break; |
1778 | } |
1779 | |
1780 | genProduceReg(node); |
1781 | } |
1782 | |
1783 | //------------------------------------------------------------------------ |
1784 | // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node |
1785 | // |
1786 | // Arguments: |
1787 | // node - The hardware intrinsic node |
1788 | // |
1789 | void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) |
1790 | { |
1791 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1792 | GenTree* op1 = node->gtGetOp1(); |
1793 | GenTree* op2 = node->gtGetOp2(); |
1794 | GenTree* op3 = nullptr; |
1795 | GenTree* op4 = nullptr; |
1796 | regNumber targetReg = node->gtRegNum; |
1797 | var_types targetType = node->TypeGet(); |
1798 | var_types baseType = node->gtSIMDBaseType; |
1799 | |
1800 | regNumber op1Reg = REG_NA; |
1801 | regNumber op2Reg = REG_NA; |
1802 | regNumber op3Reg = REG_NA; |
1803 | regNumber op4Reg = REG_NA; |
1804 | emitter* emit = getEmitter(); |
1805 | |
1806 | if ((op1 != nullptr) && !op1->OperIsList()) |
1807 | { |
1808 | op1Reg = op1->gtRegNum; |
1809 | genConsumeOperands(node); |
1810 | } |
1811 | |
1812 | switch (intrinsicId) |
1813 | { |
1814 | case NI_SSE41_TestAllOnes: |
1815 | { |
1816 | regNumber tmpReg = node->GetSingleTempReg(); |
1817 | assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest); |
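            // ptest sets CF when ((NOT first operand) AND second operand) == 0. Testing op1 against an
            // all-ones vector (produced by pcmpeqd tmp, tmp, tmp) and then checking CF via setb therefore
            // answers "are all bits of op1 set".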
1818 | emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg); |
1819 | emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg); |
1820 | emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); |
1821 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1822 | break; |
1823 | } |
1824 | |
1825 | case NI_SSE41_TestAllZeros: |
1826 | case NI_SSE41_TestZ: |
1827 | { |
1828 | assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest); |
1829 | genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16)); |
1830 | emit->emitIns_R(INS_sete, EA_1BYTE, targetReg); |
1831 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1832 | break; |
1833 | } |
1834 | |
1835 | case NI_SSE41_TestC: |
1836 | { |
1837 | assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest); |
1838 | genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16)); |
1839 | emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); |
1840 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1841 | break; |
1842 | } |
1843 | |
1844 | case NI_SSE41_TestMixOnesZeros: |
1845 | case NI_SSE41_TestNotZAndNotC: |
1846 | { |
1847 | assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest); |
1848 | genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16)); |
1849 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
1850 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
1851 | break; |
1852 | } |
1853 | |
1854 | case NI_SSE41_Extract: |
1855 | case NI_SSE41_X64_Extract: |
1856 | { |
1857 | regNumber tmpTargetReg = REG_NA; |
1858 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1859 | if (baseType == TYP_FLOAT) |
1860 | { |
1861 | tmpTargetReg = node->ExtractTempReg(); |
1862 | } |
1863 | |
1864 | auto emitSwCase = [&](int8_t i) { |
1865 | if (baseType == TYP_FLOAT) |
1866 | { |
                    // Extract instructions write their result to a general-purpose register, so the
                    // emit size must be that of TYP_INT.
1868 | emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i); |
1869 | emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg); |
1870 | } |
1871 | else |
1872 | { |
1873 | emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i); |
1874 | } |
1875 | }; |
1876 | |
1877 | if (op2->IsCnsIntOrI()) |
1878 | { |
1879 | ssize_t ival = op2->AsIntCon()->IconValue(); |
1880 | assert((ival >= 0) && (ival <= 255)); |
1881 | emitSwCase((int8_t)ival); |
1882 | } |
1883 | else |
1884 | { |
                // We emit a fallback (a jump table over all possible immediate values) for the scenario
                // where the imm-op is not a constant. This normally happens when the intrinsic is called
                // indirectly, such as via Reflection, but it can also occur if the caller invokes it
                // directly and simply does not pass a constant value.
1888 | regNumber baseReg = node->ExtractTempReg(); |
1889 | regNumber offsReg = node->GetSingleTempReg(); |
1890 | genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase); |
1891 | } |
1892 | break; |
1893 | } |
1894 | |
1895 | default: |
1896 | unreached(); |
1897 | break; |
1898 | } |
1899 | |
1900 | genProduceReg(node); |
1901 | } |
1902 | |
1903 | //------------------------------------------------------------------------ |
1904 | // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node |
1905 | // |
1906 | // Arguments: |
1907 | // node - The hardware intrinsic node |
1908 | // |
1909 | void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node) |
1910 | { |
1911 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1912 | regNumber targetReg = node->gtRegNum; |
1913 | GenTree* op1 = node->gtGetOp1(); |
1914 | GenTree* op2 = node->gtGetOp2(); |
1915 | var_types baseType = node->gtSIMDBaseType; |
1916 | var_types targetType = node->TypeGet(); |
1917 | emitter* emit = getEmitter(); |
1918 | |
1919 | regNumber op1Reg = op1->gtRegNum; |
1920 | genConsumeOperands(node); |
1921 | |
1922 | assert(targetReg != REG_NA); |
1923 | assert(op1Reg != REG_NA); |
1924 | assert(op2 != nullptr); |
1925 | assert(!node->OperIsCommutative()); |
1926 | |
1927 | switch (intrinsicId) |
1928 | { |
1929 | case NI_SSE42_Crc32: |
1930 | case NI_SSE42_X64_Crc32: |
1931 | { |
1932 | if (op1Reg != targetReg) |
1933 | { |
1934 | assert(op2->gtRegNum != targetReg); |
1935 | emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg); |
1936 | } |
1937 | |
1938 | // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an |
1939 | // overload that explicitly takes the operands. |
1940 | node->gtOp1 = op2; |
1941 | node->gtOp2 = nullptr; |
1942 | |
1943 | if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument |
1944 | { |
1945 | assert(targetType == TYP_INT); |
1946 | genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType)); |
1947 | } |
1948 | else |
1949 | { |
1950 | assert(op1->TypeGet() == op2->TypeGet()); |
1951 | assert((targetType == TYP_INT) || (targetType == TYP_LONG)); |
1952 | genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType)); |
1953 | } |
1954 | |
1955 | break; |
1956 | } |
1957 | |
1958 | default: |
1959 | { |
1960 | unreached(); |
1961 | break; |
1962 | } |
1963 | } |
1964 | |
1965 | genProduceReg(node); |
1966 | } |
1967 | |
1968 | //------------------------------------------------------------------------ |
1969 | // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node |
1970 | // |
1971 | // Arguments: |
1972 | // node - The hardware intrinsic node |
1973 | // |
1974 | void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) |
1975 | { |
1976 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
1977 | var_types baseType = node->gtSIMDBaseType; |
1978 | emitAttr attr = EA_ATTR(node->gtSIMDSize); |
1979 | var_types targetType = node->TypeGet(); |
1980 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
1981 | int numArgs = HWIntrinsicInfo::lookupNumArgs(node); |
1982 | GenTree* op1 = node->gtGetOp1(); |
1983 | GenTree* op2 = node->gtGetOp2(); |
1984 | regNumber op1Reg = REG_NA; |
1985 | regNumber op2Reg = REG_NA; |
1986 | regNumber targetReg = node->gtRegNum; |
1987 | emitter* emit = getEmitter(); |
1988 | |
1989 | if ((op1 != nullptr) && !op1->OperIsList()) |
1990 | { |
1991 | op1Reg = op1->gtRegNum; |
1992 | genConsumeOperands(node); |
1993 | } |
1994 | |
1995 | switch (intrinsicId) |
1996 | { |
1997 | case NI_AVX2_ConvertToInt32: |
1998 | case NI_AVX2_ConvertToUInt32: |
1999 | { |
2000 | assert(op2 == nullptr); |
2001 | assert((baseType == TYP_INT) || (baseType == TYP_UINT)); |
2002 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
2003 | emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg); |
2004 | break; |
2005 | } |
2006 | |
2007 | case NI_AVX2_GatherVector128: |
2008 | case NI_AVX2_GatherVector256: |
2009 | case NI_AVX2_GatherMaskVector128: |
2010 | case NI_AVX2_GatherMaskVector256: |
2011 | { |
2012 | GenTreeArgList* list = op1->AsArgList(); |
2013 | op1 = list->Current(); |
2014 | op1Reg = op1->gtRegNum; |
2015 | genConsumeRegs(op1); |
2016 | |
2017 | list = list->Rest(); |
2018 | op2 = list->Current(); |
2019 | op2Reg = op2->gtRegNum; |
2020 | genConsumeRegs(op2); |
2021 | |
2022 | list = list->Rest(); |
2023 | GenTree* op3 = list->Current(); |
2024 | genConsumeRegs(op3); |
2025 | |
2026 | list = list->Rest(); |
2027 | GenTree* op4 = nullptr; |
2028 | GenTree* lastOp = nullptr; |
2029 | GenTree* indexOp = nullptr; |
2030 | |
2031 | regNumber op3Reg = REG_NA; |
2032 | regNumber op4Reg = REG_NA; |
2033 | regNumber addrBaseReg = REG_NA; |
2034 | regNumber addrIndexReg = REG_NA; |
2035 | regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT); |
2036 | |
2037 | if (numArgs == 5) |
2038 | { |
2039 | assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256); |
2040 | op4 = list->Current(); |
2041 | list = list->Rest(); |
2042 | lastOp = list->Current(); |
2043 | op3Reg = op3->gtRegNum; |
2044 | op4Reg = op4->gtRegNum; |
2045 | genConsumeRegs(op4); |
2046 | addrBaseReg = op2Reg; |
2047 | addrIndexReg = op3Reg; |
2048 | indexOp = op3; |
2049 | |
                // Copy op4Reg into the temporary mask register; the gather instruction will clear the
                // mask register.
2052 | emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg); |
2053 | |
2054 | if (targetReg != op1Reg) |
2055 | { |
                    // Copy the source vector into the target register for the masked merge.
2057 | emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); |
2058 | } |
2059 | } |
2060 | else |
2061 | { |
2062 | assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256); |
2063 | addrBaseReg = op1Reg; |
2064 | addrIndexReg = op2Reg; |
2065 | indexOp = op2; |
2066 | lastOp = op3; |
2067 | |
                // Generate an all-ones mask vector.
2069 | emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg); |
2070 | } |
2071 | |
2072 | bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32); |
2073 | |
            // hwintrinsiclistxarch.h uses dword-index instructions by default, so switch to the
            // qword-index forms when the index base type is long.
2075 | if (varTypeIsLong(node->gtIndexBaseType)) |
2076 | { |
2077 | switch (ins) |
2078 | { |
2079 | case INS_vpgatherdd: |
2080 | ins = INS_vpgatherqd; |
2081 | if (isVector128GatherWithVector256Index) |
2082 | { |
2083 | // YMM index in address mode |
2084 | attr = emitTypeSize(TYP_SIMD32); |
2085 | } |
2086 | break; |
2087 | case INS_vpgatherdq: |
2088 | ins = INS_vpgatherqq; |
2089 | break; |
2090 | case INS_vgatherdps: |
2091 | ins = INS_vgatherqps; |
2092 | if (isVector128GatherWithVector256Index) |
2093 | { |
2094 | // YMM index in address mode |
2095 | attr = emitTypeSize(TYP_SIMD32); |
2096 | } |
2097 | break; |
2098 | case INS_vgatherdpd: |
2099 | ins = INS_vgatherqpd; |
2100 | break; |
2101 | default: |
2102 | unreached(); |
2103 | } |
2104 | } |
2105 | |
2106 | assert(lastOp->IsCnsIntOrI()); |
2107 | ssize_t ival = lastOp->AsIntCon()->IconValue(); |
2108 | assert((ival >= 0) && (ival <= 255)); |
2109 | |
2110 | assert(targetReg != maskReg); |
2111 | assert(targetReg != addrIndexReg); |
2112 | assert(maskReg != addrIndexReg); |
2113 | emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0); |
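            // Illustrative only: for Avx2.GatherVector256(int* base, Vector256<int> index, 4) this case
            // ends up emitting roughly
            //     vpcmpeqd   ymmMask, ymmMask, ymmMask                ; all-ones mask, gather every element
            //     vpgatherdd ymmDst, [baseReg + ymmIndex * 4], ymmMask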
2114 | |
2115 | break; |
2116 | } |
2117 | |
2118 | case NI_AVX_TestC: |
2119 | { |
2120 | genHWIntrinsic_R_RM(node, ins, attr); |
2121 | emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); |
2122 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
2123 | break; |
2124 | } |
2125 | |
2126 | case NI_AVX_TestNotZAndNotC: |
2127 | { |
2128 | genHWIntrinsic_R_RM(node, ins, attr); |
2129 | emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); |
2130 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
2131 | break; |
2132 | } |
2133 | |
2134 | case NI_AVX_TestZ: |
2135 | { |
2136 | genHWIntrinsic_R_RM(node, ins, attr); |
2137 | emit->emitIns_R(INS_sete, EA_1BYTE, targetReg); |
2138 | emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); |
2139 | break; |
2140 | } |
2141 | |
2142 | case NI_AVX_ExtractVector128: |
2143 | case NI_AVX_InsertVector128: |
2144 | case NI_AVX2_ExtractVector128: |
2145 | case NI_AVX2_InsertVector128: |
2146 | { |
2147 | GenTree* lastOp = nullptr; |
2148 | if (numArgs == 2) |
2149 | { |
                assert((intrinsicId == NI_AVX_ExtractVector128) || (intrinsicId == NI_AVX2_ExtractVector128));
2151 | op2Reg = op2->gtRegNum; |
2152 | lastOp = op2; |
2153 | } |
2154 | else |
2155 | { |
2156 | assert(numArgs == 3); |
2157 | assert(op1->OperIsList()); |
2158 | assert(op1->gtGetOp2()->OperIsList()); |
2159 | assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); |
2160 | |
2161 | GenTreeArgList* argList = op1->AsArgList(); |
2162 | op1 = argList->Current(); |
2163 | genConsumeRegs(op1); |
2164 | op1Reg = op1->gtRegNum; |
2165 | |
2166 | argList = argList->Rest(); |
2167 | op2 = argList->Current(); |
2168 | genConsumeRegs(op2); |
2169 | op2Reg = op2->gtRegNum; |
2170 | |
2171 | argList = argList->Rest(); |
2172 | lastOp = argList->Current(); |
2173 | genConsumeRegs(lastOp); |
2174 | } |
2175 | |
2176 | regNumber op3Reg = lastOp->gtRegNum; |
2177 | |
2178 | auto emitSwCase = [&](int8_t i) { |
2179 | if (numArgs == 3) |
2180 | { |
2181 | if (intrinsicId == NI_AVX_ExtractVector128 || intrinsicId == NI_AVX2_ExtractVector128) |
2182 | { |
2183 | emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, i); |
2184 | } |
2185 | else if (op2->TypeGet() == TYP_I_IMPL) |
2186 | { |
2187 | emit->emitIns_SIMD_R_R_AR_I(ins, attr, targetReg, op1Reg, op2Reg, i); |
2188 | } |
2189 | else |
2190 | { |
2191 | assert(op2->TypeGet() == TYP_SIMD16); |
2192 | emit->emitIns_SIMD_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, i); |
2193 | } |
2194 | } |
2195 | else |
2196 | { |
2197 | assert(numArgs == 2); |
2198 | assert(intrinsicId == NI_AVX_ExtractVector128 || intrinsicId == NI_AVX2_ExtractVector128); |
2199 | emit->emitIns_SIMD_R_R_I(ins, attr, targetReg, op1Reg, i); |
2200 | } |
2201 | }; |
2202 | |
2203 | if (lastOp->IsCnsIntOrI()) |
2204 | { |
2205 | ssize_t ival = lastOp->AsIntCon()->IconValue(); |
2206 | assert((ival >= 0) && (ival <= 255)); |
2207 | emitSwCase((int8_t)ival); |
2208 | } |
2209 | else |
2210 | { |
                // We emit a fallback (a jump table over all possible immediate values) for the scenario
                // where the imm-op is not a constant. This normally happens when the intrinsic is called
                // indirectly, such as via Reflection, but it can also occur if the caller invokes it
                // directly and simply does not pass a constant value.
2214 | regNumber baseReg = node->ExtractTempReg(); |
2215 | regNumber offsReg = node->GetSingleTempReg(); |
2216 | genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase); |
2217 | } |
2218 | break; |
2219 | } |
2220 | |
2221 | default: |
2222 | unreached(); |
2223 | break; |
2224 | } |
2225 | |
2226 | genProduceReg(node); |
2227 | } |
2228 | |
2229 | //------------------------------------------------------------------------ |
2230 | // genAESIntrinsic: Generates the code for an AES hardware intrinsic node |
2231 | // |
2232 | // Arguments: |
2233 | // node - The hardware intrinsic node |
2234 | // |
2235 | void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node) |
2236 | { |
    NYI("Implement AES intrinsic code generation");
2238 | } |
2239 | |
2240 | //------------------------------------------------------------------------ |
// genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 or BMI2 hardware intrinsic node
2242 | // |
2243 | // Arguments: |
2244 | // node - The hardware intrinsic node |
2245 | // |
2246 | void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) |
2247 | { |
2248 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
2249 | regNumber targetReg = node->gtRegNum; |
2250 | GenTree* op1 = node->gtGetOp1(); |
2251 | GenTree* op2 = node->gtGetOp2(); |
2252 | var_types targetType = node->TypeGet(); |
2253 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType); |
2254 | emitter* emit = getEmitter(); |
2255 | |
2256 | assert(targetReg != REG_NA); |
2257 | assert(op1 != nullptr); |
2258 | |
2259 | if (!op1->OperIsList()) |
2260 | { |
2261 | genConsumeOperands(node); |
2262 | } |
2263 | |
2264 | switch (intrinsicId) |
2265 | { |
2266 | case NI_BMI1_AndNot: |
2267 | case NI_BMI1_X64_AndNot: |
2268 | case NI_BMI1_BitFieldExtract: |
2269 | case NI_BMI1_X64_BitFieldExtract: |
2270 | case NI_BMI2_ParallelBitDeposit: |
2271 | case NI_BMI2_ParallelBitExtract: |
2272 | case NI_BMI2_X64_ParallelBitDeposit: |
2273 | case NI_BMI2_X64_ParallelBitExtract: |
2274 | case NI_BMI2_ZeroHighBits: |
2275 | case NI_BMI2_X64_ZeroHighBits: |
2276 | { |
2277 | assert(op2 != nullptr); |
2278 | assert((targetType == TYP_INT) || (targetType == TYP_LONG)); |
2279 | genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet())); |
2280 | break; |
2281 | } |
2282 | |
2283 | case NI_BMI1_ExtractLowestSetBit: |
2284 | case NI_BMI1_GetMaskUpToLowestSetBit: |
2285 | case NI_BMI1_ResetLowestSetBit: |
2286 | case NI_BMI1_X64_ExtractLowestSetBit: |
2287 | case NI_BMI1_X64_GetMaskUpToLowestSetBit: |
2288 | case NI_BMI1_X64_ResetLowestSetBit: |
2289 | { |
2290 | assert(op2 == nullptr); |
2291 | assert((targetType == TYP_INT) || (targetType == TYP_LONG)); |
2292 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet())); |
2293 | break; |
2294 | } |
2295 | |
2296 | case NI_BMI1_TrailingZeroCount: |
2297 | case NI_BMI1_X64_TrailingZeroCount: |
2298 | { |
2299 | assert(op2 == nullptr); |
2300 | assert((targetType == TYP_INT) || (targetType == TYP_LONG)); |
2301 | genXCNTIntrinsic(node, ins); |
2302 | break; |
2303 | } |
2304 | |
2305 | case NI_BMI2_MultiplyNoFlags: |
2306 | case NI_BMI2_X64_MultiplyNoFlags: |
2307 | { |
2308 | int numArgs = HWIntrinsicInfo::lookupNumArgs(node); |
2309 | assert(numArgs == 2 || numArgs == 3); |
2310 | |
2311 | regNumber op1Reg = REG_NA; |
2312 | regNumber op2Reg = REG_NA; |
2313 | regNumber op3Reg = REG_NA; |
2314 | regNumber lowReg = REG_NA; |
2315 | |
2316 | if (numArgs == 2) |
2317 | { |
2318 | op1Reg = op1->gtRegNum; |
2319 | op2Reg = op2->gtRegNum; |
2320 | lowReg = targetReg; |
2321 | } |
2322 | else |
2323 | { |
2324 | GenTreeArgList* argList = op1->AsArgList(); |
2325 | op1 = argList->Current(); |
2326 | genConsumeRegs(op1); |
2327 | op1Reg = op1->gtRegNum; |
2328 | argList = argList->Rest(); |
2329 | op2 = argList->Current(); |
2330 | genConsumeRegs(op2); |
2331 | op2Reg = op2->gtRegNum; |
2332 | argList = argList->Rest(); |
2333 | GenTree* op3 = argList->Current(); |
2334 | genConsumeRegs(op3); |
2335 | op3Reg = op3->gtRegNum; |
2336 | assert(op3Reg != op1Reg); |
2337 | assert(op3Reg != targetReg); |
2338 | assert(op3Reg != REG_EDX); |
2339 | lowReg = node->GetSingleTempReg(); |
2340 | assert(op3Reg != lowReg); |
2341 | assert(lowReg != targetReg); |
2342 | } |
2343 | |
2344 | emitAttr attr = emitTypeSize(targetType); |
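
            // Illustrative only (register names are placeholders): for the pointer-taking overload of
            // MultiplyNoFlags, the remainder of this case emits roughly
            //     mov  edx, op1Reg               ; MULX takes one source implicitly in EDX (RDX for X64)
            //     mulx targetReg, lowReg, op2    ; high half -> targetReg, low half -> lowReg
            //     mov  [op3Reg], lowReg          ; store the low half through the supplied pointer
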
            // Move the first operand into the implicit source register EDX/RDX.
2346 | if (op1Reg != REG_EDX) |
2347 | { |
2348 | assert(op2Reg != REG_EDX); |
2349 | emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg); |
2350 | } |
2351 | |
2352 | // generate code for MULX |
2353 | genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2); |
2354 | |
            // If the lower half of the result is required, store it to the memory location pointed to by op3.
2356 | if (numArgs == 3) |
2357 | { |
2358 | emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0); |
2359 | } |
2360 | |
2361 | break; |
2362 | } |
2363 | |
2364 | default: |
2365 | { |
2366 | unreached(); |
2367 | break; |
2368 | } |
2369 | } |
2370 | |
2371 | genProduceReg(node); |
2372 | } |
2373 | |
2374 | //------------------------------------------------------------------------ |
2375 | // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node |
2376 | // |
2377 | // Arguments: |
2378 | // node - The hardware intrinsic node |
2379 | // |
2380 | void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node) |
2381 | { |
2382 | NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; |
2383 | var_types baseType = node->gtSIMDBaseType; |
2384 | emitAttr attr = EA_ATTR(node->gtSIMDSize); |
2385 | instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); |
2386 | GenTree* op1 = node->gtGetOp1(); |
2387 | regNumber targetReg = node->gtRegNum; |
2388 | |
2389 | assert(HWIntrinsicInfo::lookupNumArgs(node) == 3); |
2390 | assert(op1 != nullptr); |
2391 | assert(op1->OperIsList()); |
2392 | assert(op1->gtGetOp2()->OperIsList()); |
2393 | assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); |
2394 | |
2395 | GenTreeArgList* argList = op1->AsArgList(); |
2396 | op1 = argList->Current(); |
2397 | genConsumeRegs(op1); |
2398 | |
2399 | argList = argList->Rest(); |
2400 | GenTree* op2 = argList->Current(); |
2401 | genConsumeRegs(op2); |
2402 | |
2403 | argList = argList->Rest(); |
2404 | GenTree* op3 = argList->Current(); |
2405 | genConsumeRegs(op3); |
2406 | |
2407 | regNumber op1Reg; |
2408 | regNumber op2Reg; |
2409 | |
2410 | bool isCommutative = false; |
2411 | const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId); |
2412 | |
    // Intrinsics with CopyUpperBits semantics cannot have op1 contained
2414 | assert(!copiesUpperBits || !op1->isContained()); |
2415 | |
2416 | if (op3->isContained() || op3->isUsedFromSpillTemp()) |
2417 | { |
2418 | // 213 form: op1 = (op2 * op1) + [op3] |
2419 | |
2420 | op1Reg = op1->gtRegNum; |
2421 | op2Reg = op2->gtRegNum; |
2422 | |
2423 | isCommutative = !copiesUpperBits; |
2424 | } |
2425 | else if (op2->isContained() || op2->isUsedFromSpillTemp()) |
2426 | { |
2427 | // 132 form: op1 = (op1 * op3) + [op2] |
2428 | |
2429 | ins = (instruction)(ins - 1); |
2430 | op1Reg = op1->gtRegNum; |
2431 | op2Reg = op3->gtRegNum; |
2432 | op3 = op2; |
2433 | } |
2434 | else if (op1->isContained() || op1->isUsedFromSpillTemp()) |
2435 | { |
2436 | // 231 form: op3 = (op2 * op3) + [op1] |
2437 | |
2438 | ins = (instruction)(ins + 1); |
2439 | op1Reg = op3->gtRegNum; |
2440 | op2Reg = op2->gtRegNum; |
2441 | op3 = op1; |
2442 | } |
2443 | else |
2444 | { |
2445 | // 213 form: op1 = (op2 * op1) + op3 |
2446 | |
2447 | op1Reg = op1->gtRegNum; |
2448 | op2Reg = op2->gtRegNum; |
2449 | |
2450 | isCommutative = !copiesUpperBits; |
2451 | } |
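    // Note (illustrative): lookupIns returns the 213 form of the FMA instruction, and the 132/231 forms
    // are assumed to be adjacent entries in the instruction table, which is why (ins - 1) and (ins + 1)
    // select them above. For a float base type the three forms correspond roughly to
    //     vfmadd132ps  op1, op2, [op3]   ; op1 = (op1 * op3) + op2
    //     vfmadd213ps  op1, op2, [op3]   ; op1 = (op2 * op1) + op3
    //     vfmadd231ps  op1, op2, [op3]   ; op1 = (op2 * op3) + op1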
2452 | |
2453 | if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg)) |
2454 | { |
2455 | assert(node->isRMWHWIntrinsic(compiler)); |
2456 | |
2457 | // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic. |
2458 | // |
2459 | // For non-commutative intrinsics, we should have ensured that op2 was marked |
2460 | // delay free in order to prevent it from getting assigned the same register |
2461 | // as target. However, for commutative intrinsics, we can just swap the operands |
2462 | // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. |
2463 | |
2464 | op2Reg = op1Reg; |
2465 | op1Reg = targetReg; |
2466 | } |
2467 | |
2468 | genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3); |
2469 | genProduceReg(node); |
2470 | } |
2471 | |
2472 | //------------------------------------------------------------------------ |
2473 | // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node |
2474 | // |
2475 | // Arguments: |
2476 | // node - The hardware intrinsic node |
2477 | // |
2478 | void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node) |
2479 | { |
2480 | assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount || |
2481 | node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount); |
2482 | |
2483 | genConsumeOperands(node); |
2484 | genXCNTIntrinsic(node, INS_lzcnt); |
2485 | genProduceReg(node); |
2486 | } |
2487 | |
2488 | //------------------------------------------------------------------------ |
2489 | // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node |
2490 | // |
2491 | // Arguments: |
2492 | // node - The hardware intrinsic node |
2493 | // |
2494 | void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node) |
2495 | { |
    NYI("Implement PCLMULQDQ intrinsic code generation");
2497 | } |
2498 | |
2499 | //------------------------------------------------------------------------ |
2500 | // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node |
2501 | // |
2502 | // Arguments: |
2503 | // node - The hardware intrinsic node |
2504 | // |
2505 | void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node) |
2506 | { |
2507 | assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount); |
2508 | |
2509 | genConsumeOperands(node); |
2510 | genXCNTIntrinsic(node, INS_popcnt); |
2511 | genProduceReg(node); |
2512 | } |
2513 | |
2514 | //------------------------------------------------------------------------ |
// genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaking any
//    false dependency on the target register
2517 | // |
2518 | // Arguments: |
2519 | // node - The hardware intrinsic node |
2520 | // ins - The instruction being generated |
2521 | // |
2522 | void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins) |
2523 | { |
    // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge and Haswell,
    // and on Skylake for POPCNT only. Insert an `xor target, target` to break the dependency (the xor
    // triggers register renaming), but only when the target register is not also a source register, i.e.
    // when the dependency is not a real one.
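    // Illustrative only (register assignments are hypothetical): for Popcnt.PopCount(value) with value
    // in rcx and the result in rax, the expansion is roughly
    //     xor    eax, eax      ; break the false output dependency on rax
    //     popcnt rax, rcx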
2527 | |
2528 | GenTree* op1 = node->gtGetOp1(); |
2529 | regNumber sourceReg1 = REG_NA; |
2530 | regNumber sourceReg2 = REG_NA; |
2531 | |
2532 | if (!op1->isContained()) |
2533 | { |
2534 | sourceReg1 = op1->gtRegNum; |
2535 | } |
2536 | else if (op1->isIndir()) |
2537 | { |
2538 | GenTreeIndir* indir = op1->AsIndir(); |
2539 | GenTree* memBase = indir->Base(); |
2540 | |
2541 | if (memBase != nullptr) |
2542 | { |
2543 | sourceReg1 = memBase->gtRegNum; |
2544 | } |
2545 | |
2546 | if (indir->HasIndex()) |
2547 | { |
2548 | sourceReg2 = indir->Index()->gtRegNum; |
2549 | } |
2550 | } |
2551 | |
2552 | regNumber targetReg = node->gtRegNum; |
2553 | if ((targetReg != sourceReg1) && (targetReg != sourceReg2)) |
2554 | { |
2555 | getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); |
2556 | } |
2557 | genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet())); |
2558 | } |
2559 | |
2560 | #endif // FEATURE_HW_INTRINSICS |
2561 | |