1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
7 | XX XX |
8 | XX Register Requirements for AMD64 XX |
9 | XX XX |
10 | XX This encapsulates all the logic for setting register requirements for XX |
11 | XX the AMD64 architecture. XX |
12 | XX XX |
13 | XX XX |
14 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
15 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
16 | */ |
17 | |
18 | #include "jitpch.h" |
19 | #ifdef _MSC_VER |
20 | #pragma hdrstop |
21 | #endif |
22 | |
23 | #ifdef _TARGET_XARCH_ |
24 | |
25 | #include "jit.h" |
26 | #include "sideeffects.h" |
27 | #include "lower.h" |
28 | |
29 | //------------------------------------------------------------------------ |
// BuildNode: Build the RefPositions for a node
31 | // |
32 | // Arguments: |
33 | // treeNode - the node of interest |
34 | // |
35 | // Return Value: |
36 | // The number of sources consumed by this node. |
37 | // |
38 | // Notes: |
39 | // Preconditions: |
//    LSRA has been initialized.
41 | // |
42 | // Postconditions: |
43 | // RefPositions have been built for all the register defs and uses required |
44 | // for this node. |
45 | // |
46 | int LinearScan::BuildNode(GenTree* tree) |
47 | { |
48 | assert(!tree->isContained()); |
49 | Interval* prefSrcInterval = nullptr; |
50 | int srcCount; |
51 | int dstCount = 0; |
52 | regMaskTP dstCandidates = RBM_NONE; |
53 | regMaskTP killMask = RBM_NONE; |
54 | bool isLocalDefUse = false; |
55 | |
56 | // Reset the build-related members of LinearScan. |
57 | clearBuildState(); |
58 | |
59 | // Set the default dstCount. This may be modified below. |
60 | if (tree->IsValue()) |
61 | { |
62 | dstCount = 1; |
63 | if (tree->IsUnusedValue()) |
64 | { |
65 | isLocalDefUse = true; |
66 | } |
67 | } |
68 | else |
69 | { |
70 | dstCount = 0; |
71 | } |
72 | |
    // Floating-point types generate AVX instructions (vmovss etc.), so set the flag.
74 | SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet())); |
75 | |
76 | switch (tree->OperGet()) |
77 | { |
78 | default: |
79 | srcCount = BuildSimple(tree); |
80 | break; |
81 | |
82 | case GT_LCL_VAR: |
            // Because we do containment analysis before we redo dataflow and identify register
            // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
            // candidates.
            // If a lclVar was estimated to be a register candidate but turns out not to be, and it
            // was marked regOptional, it should now be marked contained instead.
            // TODO-XArch-CQ: When this is being called while RefPositions are being created,
            // use lvLRACandidate here instead.
90 | if (tree->IsRegOptional()) |
91 | { |
92 | if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked || |
93 | compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister) |
94 | { |
95 | tree->ClearRegOptional(); |
96 | tree->SetContained(); |
97 | INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 0)); |
98 | return 0; |
99 | } |
100 | } |
101 | __fallthrough; |
102 | |
103 | case GT_LCL_FLD: |
104 | { |
105 | // We handle tracked variables differently from non-tracked ones. If it is tracked, |
106 | // we will simply add a use of the tracked variable at its parent/consumer. |
107 | // Otherwise, for a use we need to actually add the appropriate references for loading |
108 | // or storing the variable. |
109 | // |
110 | // A tracked variable won't actually get used until the appropriate ancestor tree node |
111 | // is processed, unless this is marked "isLocalDefUse" because it is a stack-based argument |
112 | // to a call or an orphaned dead node. |
113 | // |
114 | LclVarDsc* const varDsc = &compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum]; |
115 | if (isCandidateVar(varDsc)) |
116 | { |
117 | INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 1)); |
118 | return 0; |
119 | } |
120 | srcCount = 0; |
121 | #ifdef FEATURE_SIMD |
122 | // Need an additional register to read upper 4 bytes of Vector3. |
123 | if (tree->TypeGet() == TYP_SIMD12) |
124 | { |
125 | // We need an internal register different from targetReg in which 'tree' produces its result |
126 | // because both targetReg and internal reg will be in use at the same time. |
127 | buildInternalFloatRegisterDefForNode(tree, allSIMDRegs()); |
128 | setInternalRegsDelayFree = true; |
129 | buildInternalRegisterUses(); |
130 | } |
131 | #endif |
132 | BuildDef(tree); |
133 | } |
134 | break; |
135 | |
136 | case GT_STORE_LCL_FLD: |
137 | case GT_STORE_LCL_VAR: |
138 | srcCount = BuildStoreLoc(tree->AsLclVarCommon()); |
139 | break; |
140 | |
141 | case GT_FIELD_LIST: |
142 | // These should always be contained. We don't correctly allocate or |
143 | // generate code for a non-contained GT_FIELD_LIST. |
144 | noway_assert(!"Non-contained GT_FIELD_LIST" ); |
145 | srcCount = 0; |
146 | break; |
147 | |
148 | case GT_LIST: |
149 | case GT_ARGPLACE: |
150 | case GT_NO_OP: |
151 | case GT_START_NONGC: |
152 | srcCount = 0; |
153 | assert(dstCount == 0); |
154 | break; |
155 | |
156 | case GT_PROF_HOOK: |
157 | srcCount = 0; |
158 | assert(dstCount == 0); |
159 | killMask = getKillSetForProfilerHook(); |
160 | BuildDefsWithKills(tree, 0, RBM_NONE, killMask); |
161 | break; |
162 | |
163 | case GT_CNS_INT: |
164 | case GT_CNS_LNG: |
165 | case GT_CNS_DBL: |
166 | { |
167 | srcCount = 0; |
168 | assert(dstCount == 1); |
169 | assert(!tree->IsReuseRegVal()); |
170 | RefPosition* def = BuildDef(tree); |
171 | def->getInterval()->isConstant = true; |
172 | } |
173 | break; |
174 | |
175 | #if !defined(_TARGET_64BIT_) |
176 | |
177 | case GT_LONG: |
178 | assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here. |
179 | // An unused GT_LONG node needs to consume its sources, but need not produce a register. |
180 | tree->gtType = TYP_VOID; |
181 | tree->ClearUnusedValue(); |
182 | isLocalDefUse = false; |
183 | srcCount = 2; |
184 | dstCount = 0; |
185 | BuildUse(tree->gtGetOp1()); |
186 | BuildUse(tree->gtGetOp2()); |
187 | break; |
188 | |
189 | #endif // !defined(_TARGET_64BIT_) |
190 | |
191 | case GT_BOX: |
192 | case GT_COMMA: |
193 | case GT_QMARK: |
194 | case GT_COLON: |
195 | srcCount = 0; |
196 | unreached(); |
197 | break; |
198 | |
199 | case GT_RETURN: |
200 | srcCount = BuildReturn(tree); |
201 | killMask = getKillSetForReturn(); |
202 | BuildDefsWithKills(tree, 0, RBM_NONE, killMask); |
203 | break; |
204 | |
205 | case GT_RETFILT: |
206 | assert(dstCount == 0); |
207 | if (tree->TypeGet() == TYP_VOID) |
208 | { |
209 | srcCount = 0; |
210 | } |
211 | else |
212 | { |
213 | assert(tree->TypeGet() == TYP_INT); |
214 | srcCount = 1; |
215 | BuildUse(tree->gtGetOp1(), RBM_INTRET); |
216 | } |
217 | break; |
218 | |
        // A GT_NOP is a passthrough if it is void or if it has a child, but it must
        // be considered to produce a dummy value if it has a type but no child.
222 | case GT_NOP: |
223 | srcCount = 0; |
224 | assert((tree->gtGetOp1() == nullptr) || tree->isContained()); |
            if (tree->TypeGet() != TYP_VOID && tree->gtGetOp1() == nullptr)
            {
                // A typed GT_NOP with no child produces a dummy value; there is no source to build.
                assert(dstCount == 1);
                BuildDef(tree);
            }
231 | else |
232 | { |
233 | assert(dstCount == 0); |
234 | } |
235 | break; |
236 | |
237 | case GT_JTRUE: |
238 | { |
239 | srcCount = 0; |
240 | assert(dstCount == 0); |
241 | GenTree* cmp = tree->gtGetOp1(); |
242 | assert(!cmp->IsValue()); |
243 | } |
244 | break; |
245 | |
246 | case GT_JCC: |
247 | srcCount = 0; |
248 | assert(dstCount == 0); |
249 | break; |
250 | |
251 | case GT_SETCC: |
252 | srcCount = 0; |
253 | assert(dstCount == 1); |
254 | // This defines a byte value (note that on x64 allByteRegs() is defined as RBM_ALLINT). |
255 | BuildDef(tree, allByteRegs()); |
256 | break; |
257 | |
258 | case GT_JMP: |
259 | srcCount = 0; |
260 | assert(dstCount == 0); |
261 | break; |
262 | |
263 | case GT_SWITCH: |
264 | // This should never occur since switch nodes must not be visible at this |
265 | // point in the JIT. |
266 | srcCount = 0; |
267 | noway_assert(!"Switch must be lowered at this point" ); |
268 | break; |
269 | |
270 | case GT_JMPTABLE: |
271 | srcCount = 0; |
272 | assert(dstCount == 1); |
273 | BuildDef(tree); |
274 | break; |
275 | |
276 | case GT_SWITCH_TABLE: |
277 | { |
278 | assert(dstCount == 0); |
279 | buildInternalIntRegisterDefForNode(tree); |
280 | srcCount = BuildBinaryUses(tree->AsOp()); |
281 | buildInternalRegisterUses(); |
282 | assert(srcCount == 2); |
283 | } |
284 | break; |
285 | |
286 | case GT_ASG: |
287 | noway_assert(!"We should never hit any assignment operator in lowering" ); |
288 | srcCount = 0; |
289 | break; |
290 | |
291 | #if !defined(_TARGET_64BIT_) |
292 | case GT_ADD_LO: |
293 | case GT_ADD_HI: |
294 | case GT_SUB_LO: |
295 | case GT_SUB_HI: |
296 | #endif |
297 | case GT_ADD: |
298 | case GT_SUB: |
299 | case GT_AND: |
300 | case GT_OR: |
301 | case GT_XOR: |
302 | srcCount = BuildBinaryUses(tree->AsOp()); |
303 | assert(dstCount == 1); |
304 | BuildDef(tree); |
305 | break; |
306 | |
307 | case GT_BT: |
308 | srcCount = BuildBinaryUses(tree->AsOp()); |
309 | assert(dstCount == 0); |
310 | break; |
311 | |
312 | case GT_RETURNTRAP: |
313 | { |
314 | // This just turns into a compare of its child with an int + a conditional call. |
315 | RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree); |
316 | srcCount = BuildOperandUses(tree->gtGetOp1()); |
317 | buildInternalRegisterUses(); |
318 | killMask = compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC); |
319 | BuildDefsWithKills(tree, 0, RBM_NONE, killMask); |
320 | } |
321 | break; |
322 | |
323 | case GT_MOD: |
324 | case GT_DIV: |
325 | case GT_UMOD: |
326 | case GT_UDIV: |
327 | srcCount = BuildModDiv(tree->AsOp()); |
328 | break; |
329 | |
330 | #if defined(_TARGET_X86_) |
331 | case GT_MUL_LONG: |
332 | dstCount = 2; |
333 | __fallthrough; |
334 | #endif |
335 | case GT_MUL: |
336 | case GT_MULHI: |
337 | srcCount = BuildMul(tree->AsOp()); |
338 | break; |
339 | |
340 | case GT_INTRINSIC: |
341 | srcCount = BuildIntrinsic(tree->AsOp()); |
342 | break; |
343 | |
344 | #ifdef FEATURE_SIMD |
345 | case GT_SIMD: |
346 | srcCount = BuildSIMD(tree->AsSIMD()); |
347 | break; |
348 | #endif // FEATURE_SIMD |
349 | |
350 | #ifdef FEATURE_HW_INTRINSICS |
351 | case GT_HWIntrinsic: |
352 | srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic()); |
353 | break; |
354 | #endif // FEATURE_HW_INTRINSICS |
355 | |
356 | case GT_CAST: |
357 | assert(dstCount == 1); |
358 | srcCount = BuildCast(tree->AsCast()); |
359 | break; |
360 | |
361 | case GT_BITCAST: |
362 | { |
363 | assert(dstCount == 1); |
364 | tgtPrefUse = BuildUse(tree->gtGetOp1()); |
365 | BuildDef(tree); |
366 | srcCount = 1; |
367 | } |
368 | break; |
369 | |
370 | case GT_NEG: |
371 | // TODO-XArch-CQ: |
372 | // SSE instruction set doesn't have an instruction to negate a number. |
373 | // The recommended way is to xor the float/double number with a bitmask. |
374 | // The only way to xor is using xorps or xorpd both of which operate on |
375 | // 128-bit operands. To hold the bit-mask we would need another xmm |
376 | // register or a 16-byte aligned 128-bit data constant. Right now emitter |
377 | // lacks the support for emitting such constants or instruction with mem |
378 | // addressing mode referring to a 128-bit operand. For now we use an |
379 | // internal xmm register to load 32/64-bit bitmask from data section. |
380 | // Note that by trading additional data section memory (128-bit) we can |
381 | // save on the need for an internal register and also a memory-to-reg |
382 | // move. |
383 | // |
384 | // Note: another option to avoid internal register requirement is by |
385 | // lowering as GT_SUB(0, src). This will generate code different from |
386 | // Jit64 and could possibly result in compat issues (?). |
387 | if (varTypeIsFloating(tree)) |
388 | { |
390 | RefPosition* internalDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates()); |
391 | srcCount = BuildOperandUses(tree->gtGetOp1()); |
392 | buildInternalRegisterUses(); |
393 | } |
394 | else |
395 | { |
396 | srcCount = BuildOperandUses(tree->gtGetOp1()); |
397 | } |
398 | BuildDef(tree); |
399 | break; |
400 | |
401 | case GT_NOT: |
402 | srcCount = BuildOperandUses(tree->gtGetOp1()); |
403 | BuildDef(tree); |
404 | break; |
405 | |
406 | case GT_LSH: |
407 | case GT_RSH: |
408 | case GT_RSZ: |
409 | case GT_ROL: |
410 | case GT_ROR: |
411 | #ifdef _TARGET_X86_ |
412 | case GT_LSH_HI: |
413 | case GT_RSH_LO: |
414 | #endif |
415 | srcCount = BuildShiftRotate(tree); |
416 | break; |
417 | |
418 | case GT_EQ: |
419 | case GT_NE: |
420 | case GT_LT: |
421 | case GT_LE: |
422 | case GT_GE: |
423 | case GT_GT: |
424 | case GT_TEST_EQ: |
425 | case GT_TEST_NE: |
426 | case GT_CMP: |
427 | srcCount = BuildCmp(tree); |
428 | break; |
429 | |
430 | case GT_CKFINITE: |
431 | { |
432 | assert(dstCount == 1); |
433 | RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree); |
434 | srcCount = BuildOperandUses(tree->gtGetOp1()); |
435 | buildInternalRegisterUses(); |
436 | BuildDef(tree); |
437 | } |
438 | break; |
439 | |
440 | case GT_CMPXCHG: |
441 | { |
442 | srcCount = 3; |
443 | assert(dstCount == 1); |
444 | |
445 | // Comparand is preferenced to RAX. |
446 | // The remaining two operands can be in any reg other than RAX. |
447 | BuildUse(tree->gtCmpXchg.gtOpLocation, allRegs(TYP_INT) & ~RBM_RAX); |
448 | BuildUse(tree->gtCmpXchg.gtOpValue, allRegs(TYP_INT) & ~RBM_RAX); |
449 | BuildUse(tree->gtCmpXchg.gtOpComparand, RBM_RAX); |
450 | BuildDef(tree, RBM_RAX); |
451 | } |
452 | break; |
453 | |
454 | case GT_XADD: |
455 | case GT_XCHG: |
456 | { |
457 | // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have |
458 | // to special case them. |
459 | // These tree nodes will have their op1 marked as isDelayFree=true. |
460 | // That is, op1's reg remains in use until the subsequent instruction. |
461 | GenTree* addr = tree->gtGetOp1(); |
462 | GenTree* data = tree->gtGetOp2(); |
463 | assert(!addr->isContained()); |
464 | RefPosition* addrUse = BuildUse(addr); |
465 | setDelayFree(addrUse); |
466 | tgtPrefUse = addrUse; |
467 | assert(!data->isContained()); |
468 | BuildUse(data); |
469 | srcCount = 2; |
470 | assert(dstCount == 1); |
471 | BuildDef(tree); |
472 | } |
473 | break; |
474 | |
475 | case GT_PUTARG_REG: |
476 | srcCount = BuildPutArgReg(tree->AsUnOp()); |
477 | break; |
478 | |
479 | case GT_CALL: |
480 | srcCount = BuildCall(tree->AsCall()); |
481 | if (tree->AsCall()->HasMultiRegRetVal()) |
482 | { |
483 | dstCount = tree->AsCall()->GetReturnTypeDesc()->GetReturnRegCount(); |
484 | } |
485 | break; |
486 | |
487 | case GT_ADDR: |
488 | { |
489 | // For a GT_ADDR, the child node should not be evaluated into a register |
490 | GenTree* child = tree->gtGetOp1(); |
491 | assert(!isCandidateLocalRef(child)); |
492 | assert(child->isContained()); |
493 | assert(dstCount == 1); |
494 | srcCount = 0; |
495 | } |
496 | break; |
497 | |
498 | #if !defined(FEATURE_PUT_STRUCT_ARG_STK) |
499 | case GT_OBJ: |
500 | #endif |
501 | case GT_BLK: |
502 | case GT_DYN_BLK: |
503 | // These should all be eliminated prior to Lowering. |
504 | assert(!"Non-store block node in Lowering" ); |
505 | srcCount = 0; |
506 | break; |
507 | |
508 | #ifdef FEATURE_PUT_STRUCT_ARG_STK |
509 | case GT_PUTARG_STK: |
510 | srcCount = BuildPutArgStk(tree->AsPutArgStk()); |
511 | break; |
512 | #endif // FEATURE_PUT_STRUCT_ARG_STK |
513 | |
514 | case GT_STORE_BLK: |
515 | case GT_STORE_OBJ: |
516 | case GT_STORE_DYN_BLK: |
517 | srcCount = BuildBlockStore(tree->AsBlk()); |
518 | break; |
519 | |
520 | case GT_INIT_VAL: |
521 | // Always a passthrough of its child's value. |
522 | assert(!"INIT_VAL should always be contained" ); |
523 | srcCount = 0; |
524 | break; |
525 | |
526 | case GT_LCLHEAP: |
527 | srcCount = BuildLclHeap(tree); |
528 | break; |
529 | |
530 | case GT_ARR_BOUNDS_CHECK: |
531 | #ifdef FEATURE_SIMD |
532 | case GT_SIMD_CHK: |
533 | #endif // FEATURE_SIMD |
534 | #ifdef FEATURE_HW_INTRINSICS |
535 | case GT_HW_INTRINSIC_CHK: |
536 | #endif // FEATURE_HW_INTRINSICS |
537 | |
538 | // Consumes arrLen & index - has no result |
540 | assert(dstCount == 0); |
541 | srcCount = BuildOperandUses(tree->AsBoundsChk()->gtIndex); |
542 | srcCount += BuildOperandUses(tree->AsBoundsChk()->gtArrLen); |
543 | break; |
544 | |
545 | case GT_ARR_ELEM: |
546 | // These must have been lowered to GT_ARR_INDEX |
547 | noway_assert(!"We should never see a GT_ARR_ELEM after Lowering." ); |
548 | srcCount = 0; |
549 | break; |
550 | |
551 | case GT_ARR_INDEX: |
552 | { |
553 | srcCount = 2; |
554 | assert(dstCount == 1); |
555 | assert(!tree->AsArrIndex()->ArrObj()->isContained()); |
556 | assert(!tree->AsArrIndex()->IndexExpr()->isContained()); |
557 | // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple |
558 | // times while the result is being computed. |
559 | RefPosition* arrObjUse = BuildUse(tree->AsArrIndex()->ArrObj()); |
560 | setDelayFree(arrObjUse); |
561 | BuildUse(tree->AsArrIndex()->IndexExpr()); |
562 | BuildDef(tree); |
563 | } |
564 | break; |
565 | |
566 | case GT_ARR_OFFSET: |
567 | { |
568 | // This consumes the offset, if any, the arrObj and the effective index, |
569 | // and produces the flattened offset for this dimension. |
570 | assert(dstCount == 1); |
571 | srcCount = 0; |
572 | RefPosition* internalDef = nullptr; |
573 | if (tree->gtArrOffs.gtOffset->isContained()) |
574 | { |
575 | srcCount = 2; |
576 | } |
577 | else |
578 | { |
579 | // Here we simply need an internal register, which must be different |
580 | // from any of the operand's registers, but may be the same as targetReg. |
581 | srcCount = 3; |
582 | internalDef = buildInternalIntRegisterDefForNode(tree); |
583 | BuildUse(tree->AsArrOffs()->gtOffset); |
584 | } |
585 | BuildUse(tree->AsArrOffs()->gtIndex); |
586 | BuildUse(tree->AsArrOffs()->gtArrObj); |
587 | if (internalDef != nullptr) |
588 | { |
589 | buildInternalRegisterUses(); |
590 | } |
591 | BuildDef(tree); |
592 | } |
593 | break; |
594 | |
595 | case GT_LEA: |
596 | // The LEA usually passes its operands through to the GT_IND, in which case it will |
597 | // be contained, but we may be instantiating an address, in which case we set them here. |
598 | srcCount = 0; |
599 | assert(dstCount == 1); |
600 | if (tree->AsAddrMode()->HasBase()) |
601 | { |
602 | srcCount++; |
603 | BuildUse(tree->AsAddrMode()->Base()); |
604 | } |
605 | if (tree->AsAddrMode()->HasIndex()) |
606 | { |
607 | srcCount++; |
608 | BuildUse(tree->AsAddrMode()->Index()); |
609 | } |
610 | BuildDef(tree); |
611 | break; |
612 | |
613 | case GT_STOREIND: |
614 | if (compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(tree)) |
615 | { |
616 | srcCount = BuildGCWriteBarrier(tree); |
617 | break; |
618 | } |
619 | srcCount = BuildIndir(tree->AsIndir()); |
620 | break; |
621 | |
622 | case GT_NULLCHECK: |
623 | { |
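            // A null check simply consumes its address operand in any register; it produces no value.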
624 | assert(dstCount == 0); |
625 | regMaskTP indirCandidates = RBM_NONE; |
626 | BuildUse(tree->gtGetOp1(), indirCandidates); |
627 | srcCount = 1; |
628 | break; |
629 | } |
630 | |
631 | case GT_IND: |
632 | srcCount = BuildIndir(tree->AsIndir()); |
633 | assert(dstCount == 1); |
634 | break; |
635 | |
636 | case GT_CATCH_ARG: |
637 | srcCount = 0; |
638 | assert(dstCount == 1); |
639 | BuildDef(tree, RBM_EXCEPTION_OBJECT); |
640 | break; |
641 | |
642 | #if !FEATURE_EH_FUNCLETS |
643 | case GT_END_LFIN: |
644 | srcCount = 0; |
645 | assert(dstCount == 0); |
646 | break; |
647 | #endif |
648 | |
649 | case GT_CLS_VAR: |
650 | // These nodes are eliminated by rationalizer. |
651 | JITDUMP("Unexpected node %s in Lower.\n" , GenTree::OpName(tree->OperGet())); |
652 | unreached(); |
653 | break; |
654 | |
655 | case GT_INDEX_ADDR: |
656 | { |
657 | assert(dstCount == 1); |
658 | RefPosition* internalDef = nullptr; |
659 | if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL) |
660 | { |
661 | internalDef = buildInternalIntRegisterDefForNode(tree); |
662 | } |
663 | else |
664 | { |
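                // Element sizes of 1, 2, 4 and 8 can be encoded directly as the scale of an
                // addressing mode; any other element size requires an internal register to
                // compute the scaled offset.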
665 | switch (tree->AsIndexAddr()->gtElemSize) |
666 | { |
667 | case 1: |
668 | case 2: |
669 | case 4: |
670 | case 8: |
671 | break; |
672 | |
673 | default: |
674 | internalDef = buildInternalIntRegisterDefForNode(tree); |
675 | break; |
676 | } |
677 | } |
678 | srcCount = BuildBinaryUses(tree->AsOp()); |
679 | if (internalDef != nullptr) |
680 | { |
681 | buildInternalRegisterUses(); |
682 | } |
683 | BuildDef(tree); |
684 | } |
685 | break; |
686 | |
687 | } // end switch (tree->OperGet()) |
688 | |
689 | // We need to be sure that we've set srcCount and dstCount appropriately. |
    // Note that for XARCH, the maximum number of registers defined is 2.
691 | assert((dstCount < 2) || ((dstCount == 2) && tree->IsMultiRegNode())); |
692 | assert(isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue())); |
693 | assert(!tree->IsUnusedValue() || (dstCount != 0)); |
694 | assert(dstCount == tree->GetRegisterDstCount()); |
695 | INDEBUG(dumpNodeInfo(tree, dstCandidates, srcCount, dstCount)); |
696 | return srcCount; |
697 | } |
698 | |
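//------------------------------------------------------------------------
// getTgtPrefOperand: Identify the operand, if any, that should be preferenced
//                    to the target register of an RMW binary operator.
//
// Arguments:
//    tree - the binary node of interest
//
// Return Value:
//    The operand to preference to the target register (op1, or op2 for a commutative
//    operator whose op1 is contained), or nullptr if there is no such operand.
//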
699 | GenTree* LinearScan::getTgtPrefOperand(GenTreeOp* tree) |
700 | { |
701 | // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1. |
702 | // Even then we would like to set isTgtPref on Op1. |
703 | if (tree->OperIsBinary() && isRMWRegOper(tree)) |
704 | { |
705 | GenTree* op1 = tree->gtGetOp1(); |
706 | GenTree* op2 = tree->gtGetOp2(); |
707 | |
708 | // Commutative opers like add/mul/and/or/xor could reverse the order of |
709 | // operands if it is safe to do so. In such a case we would like op2 to be |
710 | // target preferenced instead of op1. |
711 | if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr) |
712 | { |
713 | op1 = op2; |
714 | op2 = tree->gtGetOp1(); |
715 | } |
716 | |
717 | // If we have a read-modify-write operation, we want to preference op1 to the target, |
718 | // if it is not contained. |
719 | if (!op1->isContained() && !op1->OperIs(GT_LIST)) |
720 | { |
721 | return op1; |
722 | } |
723 | } |
724 | return nullptr; |
725 | } |
726 | |
727 | //------------------------------------------------------------------------------ |
728 | // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format |
729 | // |
730 | // Arguments: |
731 | // tree - a binary tree node |
732 | // |
733 | // Return Value: |
734 | // Returns true if we can use the read-modify-write instruction form |
735 | // |
736 | // Notes: |
737 | // This is used to determine whether to preference the source to the destination register. |
738 | // |
739 | bool LinearScan::isRMWRegOper(GenTree* tree) |
740 | { |
741 | // TODO-XArch-CQ: Make this more accurate. |
    // For now, we assume that most binary operators are of the RMW form.
743 | assert(tree->OperIsBinary()); |
744 | |
745 | if (tree->OperIsCompare() || tree->OperIs(GT_CMP) || tree->OperIs(GT_BT)) |
746 | { |
747 | return false; |
748 | } |
749 | |
750 | switch (tree->OperGet()) |
751 | { |
752 | // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand |
753 | case GT_LEA: |
754 | case GT_STOREIND: |
755 | case GT_ARR_INDEX: |
756 | case GT_STORE_BLK: |
757 | case GT_STORE_OBJ: |
758 | case GT_SWITCH_TABLE: |
759 | case GT_LOCKADD: |
760 | #ifdef _TARGET_X86_ |
761 | case GT_LONG: |
762 | #endif |
763 | return false; |
764 | |
        // x86/x64 does support a three-operand multiply when either op1 or op2 is a contained immediate.
766 | case GT_MUL: |
767 | return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed()); |
768 | |
769 | #ifdef FEATURE_HW_INTRINSICS |
770 | case GT_HWIntrinsic: |
771 | return tree->isRMWHWIntrinsic(compiler); |
772 | #endif // FEATURE_HW_INTRINSICS |
773 | |
774 | default: |
775 | return true; |
776 | } |
777 | } |
778 | |
//------------------------------------------------------------------------
// BuildRMWUses: Build the RefPositions (uses) for a read-modify-write (RMW) node.
//
// Arguments:
//    node       - The RMW node of interest
//    candidates - The set of candidate registers for the node's operands
//
// Return Value:
//    The number of sources consumed by this node.
//
780 | int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates) |
781 | { |
782 | int srcCount = 0; |
783 | GenTree* op1 = node->gtOp1; |
784 | GenTree* op2 = node->gtGetOp2IfPresent(); |
785 | bool isReverseOp = node->IsReverseOp(); |
786 | regMaskTP op1Candidates = candidates; |
787 | regMaskTP op2Candidates = candidates; |
788 | |
789 | #ifdef _TARGET_X86_ |
790 | if (varTypeIsByte(node)) |
791 | { |
792 | regMaskTP byteCandidates = (candidates == RBM_NONE) ? allByteRegs() : (candidates & allByteRegs()); |
793 | if (!op1->isContained()) |
794 | { |
795 | assert(byteCandidates != RBM_NONE); |
796 | op1Candidates = byteCandidates; |
797 | } |
798 | if (node->OperIsCommutative() && !op2->isContained()) |
799 | { |
800 | assert(byteCandidates != RBM_NONE); |
801 | op2Candidates = byteCandidates; |
802 | } |
803 | } |
804 | #endif // _TARGET_X86_ |
805 | |
806 | GenTree* tgtPrefOperand = getTgtPrefOperand(node); |
807 | assert((tgtPrefOperand == nullptr) || (tgtPrefOperand == op1) || node->OperIsCommutative()); |
808 | assert(!isReverseOp || node->OperIsCommutative()); |
809 | |
810 | // Determine which operand, if any, should be delayRegFree. Normally, this would be op2, |
811 | // but if we have a commutative operator and op1 is a contained memory op, it would be op1. |
812 | // We need to make the delayRegFree operand remain live until the op is complete, by marking |
813 | // the source(s) associated with op2 as "delayFree". |
814 | // Note that if op2 of a binary RMW operator is a memory op, even if the operator |
815 | // is commutative, codegen cannot reverse them. |
816 | // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's |
817 | // more work to be done to correctly reverse the operands if they involve memory |
818 | // operands. Also, we may need to handle more cases than GT_IND, especially once |
819 | // we've modified the register allocator to not require all nodes to be assigned |
820 | // a register (e.g. a spilled lclVar can often be referenced directly from memory). |
821 | // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op. |
822 | GenTree* delayUseOperand = op2; |
823 | if (node->OperIsCommutative()) |
824 | { |
825 | if (op1->isContained() && op2 != nullptr) |
826 | { |
827 | delayUseOperand = op1; |
828 | } |
829 | else if (!op2->isContained() || op2->IsCnsIntOrI()) |
830 | { |
831 | // If we have a commutative operator and op2 is not a memory op, we don't need |
832 | // to set delayRegFree on either operand because codegen can swap them. |
833 | delayUseOperand = nullptr; |
834 | } |
835 | } |
836 | else if (op1->isContained()) |
837 | { |
838 | delayUseOperand = nullptr; |
839 | } |
840 | if (delayUseOperand != nullptr) |
841 | { |
842 | assert(delayUseOperand != tgtPrefOperand); |
843 | } |
844 | |
845 | if (isReverseOp) |
846 | { |
847 | op1 = op2; |
848 | op2 = node->gtOp1; |
849 | } |
850 | |
851 | // Build first use |
852 | if (tgtPrefOperand == op1) |
853 | { |
854 | assert(!op1->isContained()); |
855 | tgtPrefUse = BuildUse(op1, op1Candidates); |
856 | srcCount++; |
857 | } |
858 | else if (delayUseOperand == op1) |
859 | { |
860 | srcCount += BuildDelayFreeUses(op1, op1Candidates); |
861 | } |
862 | else |
863 | { |
864 | srcCount += BuildOperandUses(op1, op1Candidates); |
865 | } |
866 | // Build second use |
867 | if (op2 != nullptr) |
868 | { |
869 | if (tgtPrefOperand == op2) |
870 | { |
871 | assert(!op2->isContained()); |
872 | tgtPrefUse = BuildUse(op2, op2Candidates); |
873 | srcCount++; |
874 | } |
875 | else if (delayUseOperand == op2) |
876 | { |
877 | srcCount += BuildDelayFreeUses(op2, op2Candidates); |
878 | } |
879 | else |
880 | { |
881 | srcCount += BuildOperandUses(op2, op2Candidates); |
882 | } |
883 | } |
884 | return srcCount; |
885 | } |
886 | |
887 | //------------------------------------------------------------------------ |
888 | // BuildShiftRotate: Set the NodeInfo for a shift or rotate. |
889 | // |
890 | // Arguments: |
891 | // tree - The node of interest |
892 | // |
893 | // Return Value: |
894 | // The number of sources consumed by this node. |
895 | // |
896 | int LinearScan::BuildShiftRotate(GenTree* tree) |
897 | { |
    // For shift operations, the number of bits to shift must be in CL
    // if it is not a constant.
901 | int srcCount = 0; |
902 | GenTree* shiftBy = tree->gtGetOp2(); |
903 | GenTree* source = tree->gtGetOp1(); |
904 | regMaskTP srcCandidates = RBM_NONE; |
905 | regMaskTP dstCandidates = RBM_NONE; |
906 | |
    // x64 can encode 8 bits of shift count, but the hardware uses only the low 5 (or 6) bits; the others are masked off.
908 | // We will allow whatever can be encoded - hope you know what you are doing. |
909 | if (shiftBy->isContained()) |
910 | { |
911 | assert(shiftBy->OperIsConst()); |
912 | } |
913 | else |
914 | { |
915 | srcCandidates = allRegs(TYP_INT) & ~RBM_RCX; |
916 | dstCandidates = allRegs(TYP_INT) & ~RBM_RCX; |
917 | } |
918 | |
919 | // Note that Rotate Left/Right instructions don't set ZF and SF flags. |
920 | // |
    // If the operand being shifted is 32 bits, the upper 3 bits of the shift count are
    // masked off by hardware to get the actual shift count. Similarly, for 64-bit operands
    // the shift count is narrowed to [0..63]. If the resulting shift count is zero, the
    // shift operation won't modify flags.
925 | // |
926 | // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0) |
927 | // if the shift count is known to be non-zero and in the range depending on the |
928 | // operand size. |
929 | CLANG_FORMAT_COMMENT_ANCHOR; |
930 | |
931 | #ifdef _TARGET_X86_ |
932 | // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that |
933 | // we can have a three operand form. |
934 | if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO) |
935 | { |
936 | assert((source->OperGet() == GT_LONG) && source->isContained()); |
937 | |
938 | GenTree* sourceLo = source->gtGetOp1(); |
939 | GenTree* sourceHi = source->gtGetOp2(); |
940 | assert(!sourceLo->isContained() && !sourceHi->isContained()); |
941 | RefPosition* sourceLoUse = BuildUse(sourceLo, srcCandidates); |
942 | RefPosition* sourceHiUse = BuildUse(sourceHi, srcCandidates); |
943 | |
944 | if (!tree->isContained()) |
945 | { |
946 | if (tree->OperGet() == GT_LSH_HI) |
947 | { |
948 | setDelayFree(sourceLoUse); |
949 | } |
950 | else |
951 | { |
952 | setDelayFree(sourceHiUse); |
953 | } |
954 | } |
955 | } |
956 | else |
957 | #endif |
958 | if (!source->isContained()) |
959 | { |
960 | tgtPrefUse = BuildUse(source, srcCandidates); |
961 | srcCount++; |
962 | } |
963 | else |
964 | { |
965 | srcCount += BuildOperandUses(source, srcCandidates); |
966 | } |
967 | if (!tree->isContained()) |
968 | { |
969 | if (!shiftBy->isContained()) |
970 | { |
971 | srcCount += BuildDelayFreeUses(shiftBy, RBM_RCX); |
972 | buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX); |
973 | } |
974 | BuildDef(tree, dstCandidates); |
975 | } |
976 | else |
977 | { |
978 | if (!shiftBy->isContained()) |
979 | { |
980 | srcCount += BuildOperandUses(shiftBy, RBM_RCX); |
981 | buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX); |
982 | } |
983 | } |
984 | return srcCount; |
985 | } |
986 | |
987 | //------------------------------------------------------------------------ |
988 | // BuildCall: Set the NodeInfo for a call. |
989 | // |
990 | // Arguments: |
991 | // call - The call node of interest |
992 | // |
993 | // Return Value: |
994 | // The number of sources consumed by this node. |
995 | // |
996 | int LinearScan::BuildCall(GenTreeCall* call) |
997 | { |
998 | bool hasMultiRegRetVal = false; |
999 | ReturnTypeDesc* retTypeDesc = nullptr; |
1000 | int srcCount = 0; |
1001 | int dstCount = 0; |
1002 | regMaskTP dstCandidates = RBM_NONE; |
1003 | |
1004 | assert(!call->isContained()); |
1005 | if (call->TypeGet() != TYP_VOID) |
1006 | { |
1007 | hasMultiRegRetVal = call->HasMultiRegRetVal(); |
1008 | if (hasMultiRegRetVal) |
1009 | { |
1010 | // dst count = number of registers in which the value is returned by call |
1011 | retTypeDesc = call->GetReturnTypeDesc(); |
1012 | dstCount = retTypeDesc->GetReturnRegCount(); |
1013 | } |
1014 | else |
1015 | { |
1016 | dstCount = 1; |
1017 | } |
1018 | } |
1019 | |
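    // The control expression is the call target: for a CT_INDIRECT call it is gtCallAddr;
    // otherwise it is gtControlExpr (which may be nullptr for a direct call).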
1020 | GenTree* ctrlExpr = call->gtControlExpr; |
1021 | if (call->gtCallType == CT_INDIRECT) |
1022 | { |
1023 | ctrlExpr = call->gtCallAddr; |
1024 | } |
1025 | |
1026 | RegisterType registerType = call->TypeGet(); |
1027 | |
1028 | // Set destination candidates for return value of the call. |
1029 | CLANG_FORMAT_COMMENT_ANCHOR; |
1030 | |
1031 | #ifdef _TARGET_X86_ |
1032 | if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) |
1033 | { |
1034 | // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with |
1035 | // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the |
1036 | // correct argument registers. |
1037 | dstCandidates = RBM_PINVOKE_TCB; |
1038 | } |
1039 | else |
1040 | #endif // _TARGET_X86_ |
1041 | if (hasMultiRegRetVal) |
1042 | { |
1043 | assert(retTypeDesc != nullptr); |
1044 | dstCandidates = retTypeDesc->GetABIReturnRegs(); |
1045 | assert((int)genCountBits(dstCandidates) == dstCount); |
1046 | } |
1047 | else if (varTypeIsFloating(registerType)) |
1048 | { |
1049 | #ifdef _TARGET_X86_ |
1050 | // The return value will be on the X87 stack, and we will need to move it. |
1051 | dstCandidates = allRegs(registerType); |
1052 | #else // !_TARGET_X86_ |
1053 | dstCandidates = RBM_FLOATRET; |
1054 | #endif // !_TARGET_X86_ |
1055 | } |
1056 | else if (registerType == TYP_LONG) |
1057 | { |
1058 | dstCandidates = RBM_LNGRET; |
1059 | } |
1060 | else |
1061 | { |
1062 | dstCandidates = RBM_INTRET; |
1063 | } |
1064 | |
1065 | // number of args to a call = |
1066 | // callRegArgs + (callargs - placeholders, setup, etc) |
1067 | // there is an explicit thisPtr but it is redundant |
1068 | |
1069 | bool callHasFloatRegArgs = false; |
1070 | bool isVarArgs = call->IsVarargs(); |
1071 | |
1072 | // First, determine internal registers. |
1073 | // We will need one for any float arguments to a varArgs call. |
1074 | for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext()) |
1075 | { |
1076 | GenTree* argNode = list->Current(); |
1077 | if (argNode->OperIsPutArgReg()) |
1078 | { |
1079 | HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs); |
1080 | } |
1081 | else if (argNode->OperGet() == GT_FIELD_LIST) |
1082 | { |
1083 | for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest()) |
1084 | { |
1085 | assert(entry->Current()->OperIsPutArgReg()); |
1086 | HandleFloatVarArgs(call, entry->Current(), &callHasFloatRegArgs); |
1087 | } |
1088 | } |
1089 | } |
1090 | |
1091 | // Now, count reg args |
1092 | for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext()) |
1093 | { |
1094 | // By this point, lowering has ensured that all call arguments are one of the following: |
1095 | // - an arg setup store |
1096 | // - an arg placeholder |
1097 | // - a nop |
1098 | // - a copy blk |
1099 | // - a field list |
1100 | // - a put arg |
1101 | // |
1102 | // Note that this property is statically checked by LinearScan::CheckBlock. |
1103 | GenTree* argNode = list->Current(); |
1104 | |
1105 | // Each register argument corresponds to one source. |
1106 | if (argNode->OperIsPutArgReg()) |
1107 | { |
1108 | srcCount++; |
1109 | BuildUse(argNode, genRegMask(argNode->gtRegNum)); |
1110 | } |
1111 | #ifdef UNIX_AMD64_ABI |
1112 | else if (argNode->OperGet() == GT_FIELD_LIST) |
1113 | { |
1114 | for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest()) |
1115 | { |
1116 | assert(entry->Current()->OperIsPutArgReg()); |
1117 | srcCount++; |
1118 | BuildUse(entry->Current(), genRegMask(entry->Current()->gtRegNum)); |
1119 | } |
1120 | } |
1121 | #endif // UNIX_AMD64_ABI |
1122 | |
1123 | #ifdef DEBUG |
1124 | // In DEBUG only, check validity with respect to the arg table entry. |
1125 | |
1126 | fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode); |
1127 | assert(curArgTabEntry); |
1128 | |
1129 | if (curArgTabEntry->regNum == REG_STK) |
1130 | { |
1131 | // late arg that is not passed in a register |
1132 | assert(argNode->gtOper == GT_PUTARG_STK); |
1133 | |
1134 | #ifdef FEATURE_PUT_STRUCT_ARG_STK |
            // If the node is TYP_STRUCT and it is put on the stack with a
            // putarg_stk operation, we consume and produce no registers.
            // In this case the embedded Obj node should not produce
            // registers either, since it is contained.
            // Note that if it is a SIMD type the argument will be in a register.
1140 | if (argNode->TypeGet() == TYP_STRUCT) |
1141 | { |
1142 | assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ); |
1143 | assert(argNode->gtGetOp1()->isContained()); |
1144 | } |
1145 | #endif // FEATURE_PUT_STRUCT_ARG_STK |
1146 | continue; |
1147 | } |
1148 | #ifdef UNIX_AMD64_ABI |
1149 | if (argNode->OperGet() == GT_FIELD_LIST) |
1150 | { |
1151 | assert(argNode->isContained()); |
1152 | assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct); |
1153 | |
1154 | int i = 0; |
1155 | for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest()) |
1156 | { |
1157 | const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum; |
1158 | assert(entry->Current()->gtRegNum == argReg); |
1159 | assert(i < 2); |
1160 | i++; |
1161 | } |
1162 | } |
1163 | else |
1164 | #endif // UNIX_AMD64_ABI |
1165 | { |
1166 | const regNumber argReg = curArgTabEntry->regNum; |
1167 | assert(argNode->gtRegNum == argReg); |
1168 | } |
1169 | #endif // DEBUG |
1170 | } |
1171 | |
1172 | // Now, count stack args |
1173 | // Note that these need to be computed into a register, but then |
1174 | // they're just stored to the stack - so the reg doesn't |
1175 | // need to remain live until the call. In fact, it must not |
1176 | // because the code generator doesn't actually consider it live, |
1177 | // so it can't be spilled. |
1178 | |
1179 | GenTree* args = call->gtCallArgs; |
1180 | while (args) |
1181 | { |
1182 | GenTree* arg = args->gtGetOp1(); |
        if (!(arg->gtFlags & GTF_LATE_ARG))
1184 | { |
1185 | if (arg->IsValue() && !arg->isContained()) |
1186 | { |
1187 | assert(arg->IsUnusedValue()); |
1188 | } |
1189 | } |
1190 | args = args->gtGetOp2(); |
1191 | } |
1192 | |
1193 | // set reg requirements on call target represented as control sequence. |
1194 | if (ctrlExpr != nullptr) |
1195 | { |
1196 | regMaskTP ctrlExprCandidates = RBM_NONE; |
1197 | |
        // In case of a fast tail call implemented as jmp, make sure that gtControlExpr is
        // computed into a register.
1200 | if (call->IsFastTailCall()) |
1201 | { |
1202 | assert(!ctrlExpr->isContained()); |
1203 | // Fast tail call - make sure that call target is always computed in RAX |
1204 | // so that epilog sequence can generate "jmp rax" to achieve fast tail call. |
1205 | ctrlExprCandidates = RBM_RAX; |
1206 | } |
1207 | #ifdef _TARGET_X86_ |
1208 | else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) |
1209 | { |
1210 | // On x86, we need to generate a very specific pattern for indirect VSD calls: |
1211 | // |
1212 | // 3-byte nop |
1213 | // call dword ptr [eax] |
1214 | // |
1215 | // Where EAX is also used as an argument to the stub dispatch helper. Make |
1216 | // sure that the call target address is computed into EAX in this case. |
1217 | assert(ctrlExpr->isIndir() && ctrlExpr->isContained()); |
1218 | ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET; |
1219 | } |
1220 | #endif // _TARGET_X86_ |
1221 | |
1222 | #if FEATURE_VARARG |
        // If it is a fast tail call, the target is already preferenced to RAX.
        // Therefore, there is no need to set src candidates on the call target again.
1225 | if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall()) |
1226 | { |
1227 | // Don't assign the call target to any of the argument registers because |
1228 | // we will use them to also pass floating point arguments as required |
1229 | // by Amd64 ABI. |
1230 | ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS); |
1231 | } |
#endif // FEATURE_VARARG
1233 | srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates); |
1234 | } |
1235 | |
1236 | buildInternalRegisterUses(); |
1237 | |
1238 | // Now generate defs and kills. |
1239 | regMaskTP killMask = getKillSetForCall(call); |
1240 | BuildDefsWithKills(call, dstCount, dstCandidates, killMask); |
1241 | return srcCount; |
1242 | } |
1243 | |
1244 | //------------------------------------------------------------------------ |
1245 | // BuildBlockStore: Set the NodeInfo for a block store. |
1246 | // |
1247 | // Arguments: |
1248 | // blkNode - The block store node of interest |
1249 | // |
1250 | // Return Value: |
1251 | // The number of sources consumed by this node. |
1252 | // |
1253 | int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) |
1254 | { |
1255 | GenTree* dstAddr = blkNode->Addr(); |
1256 | unsigned size = blkNode->gtBlkSize; |
1257 | GenTree* source = blkNode->Data(); |
1258 | int srcCount = 0; |
1259 | |
1260 | GenTree* srcAddrOrFill = nullptr; |
1261 | bool isInitBlk = blkNode->OperIsInitBlkOp(); |
1262 | |
1263 | regMaskTP dstAddrRegMask = RBM_NONE; |
1264 | regMaskTP sourceRegMask = RBM_NONE; |
1265 | regMaskTP blkSizeRegMask = RBM_NONE; |
1266 | |
1267 | if (isInitBlk) |
1268 | { |
1269 | GenTree* initVal = source; |
1270 | if (initVal->OperIsInitVal()) |
1271 | { |
1272 | assert(initVal->isContained()); |
1273 | initVal = initVal->gtGetOp1(); |
1274 | } |
1275 | srcAddrOrFill = initVal; |
1276 | |
1277 | switch (blkNode->gtBlkOpKind) |
1278 | { |
1279 | case GenTreeBlk::BlkOpKindUnroll: |
1280 | assert(initVal->IsCnsIntOrI()); |
1281 | if (size >= XMM_REGSIZE_BYTES) |
1282 | { |
1283 | // Reserve an XMM register to fill it with a pack of 16 init value constants. |
1284 | buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); |
                    // The fill uses an XMM register, which may be encoded as an AVX instruction, so set the flag.
1286 | SetContainsAVXFlags(); |
1287 | } |
1288 | #ifdef _TARGET_X86_ |
1289 | if ((size & 1) != 0) |
1290 | { |
1291 | // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing |
1292 | // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this |
1293 | // when unrolling, so only allow byteable registers as the source value. (We could |
1294 | // consider just using BlkOpKindRepInstr instead.) |
1295 | sourceRegMask = allByteRegs(); |
1296 | } |
1297 | #endif // _TARGET_X86_ |
1298 | break; |
1299 | |
1300 | case GenTreeBlk::BlkOpKindRepInstr: |
1301 | // rep stos has the following register requirements: |
                // a) The memory address has to be in RDI.
1303 | // b) The fill value has to be in RAX. |
1304 | // c) The buffer size will go in RCX. |
1305 | dstAddrRegMask = RBM_RDI; |
1306 | sourceRegMask = RBM_RAX; |
1307 | blkSizeRegMask = RBM_RCX; |
1308 | break; |
1309 | |
1310 | case GenTreeBlk::BlkOpKindHelper: |
1311 | #ifdef _TARGET_AMD64_ |
1312 | // The helper follows the regular AMD64 ABI. |
1313 | dstAddrRegMask = RBM_ARG_0; |
1314 | sourceRegMask = RBM_ARG_1; |
1315 | blkSizeRegMask = RBM_ARG_2; |
1316 | #else // !_TARGET_AMD64_ |
1317 | dstAddrRegMask = RBM_RDI; |
1318 | sourceRegMask = RBM_RAX; |
1319 | blkSizeRegMask = RBM_RCX; |
1320 | #endif // !_TARGET_AMD64_ |
1321 | break; |
1322 | |
1323 | default: |
1324 | unreached(); |
1325 | } |
1326 | } |
1327 | else |
1328 | { |
1329 | // CopyObj or CopyBlk |
1330 | if (source->gtOper == GT_IND) |
1331 | { |
1332 | assert(source->isContained()); |
1333 | srcAddrOrFill = source->gtGetOp1(); |
1334 | } |
1335 | if (blkNode->OperGet() == GT_STORE_OBJ) |
1336 | { |
1337 | if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr) |
1338 | { |
1339 | // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq. |
1340 | blkSizeRegMask = RBM_RCX; |
1341 | } |
1342 | // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its |
1343 | // sources. |
1344 | sourceRegMask = RBM_RSI; |
1345 | dstAddrRegMask = RBM_RDI; |
1346 | } |
1347 | else |
1348 | { |
1349 | switch (blkNode->gtBlkOpKind) |
1350 | { |
1351 | case GenTreeBlk::BlkOpKindUnroll: |
1352 | // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. |
1353 | // |
1354 | // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. |
1355 | // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude |
1356 | // RBM_NON_BYTE_REGS from internal candidates. |
1357 | if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) |
1358 | { |
1359 | regMaskTP regMask = allRegs(TYP_INT); |
1360 | |
1361 | #ifdef _TARGET_X86_ |
1362 | if ((size & 1) != 0) |
1363 | { |
1364 | regMask &= ~RBM_NON_BYTE_REGS; |
1365 | } |
1366 | #endif |
1367 | buildInternalIntRegisterDefForNode(blkNode, regMask); |
1368 | } |
1369 | |
1370 | if (size >= XMM_REGSIZE_BYTES) |
1371 | { |
1372 | // If we have a buffer larger than XMM_REGSIZE_BYTES, |
1373 | // reserve an XMM register to use it for a |
1374 | // series of 16-byte loads and stores. |
1375 | buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); |
                        // The copy uses an XMM register for the loads and stores, which may be
                        // encoded as AVX instructions, so set the ContainsAVX flag.
1378 | SetContainsAVXFlags(); |
1379 | } |
1380 | break; |
1381 | |
1382 | case GenTreeBlk::BlkOpKindRepInstr: |
                    // rep movs has the following register requirements:
1384 | // a) The dest address has to be in RDI. |
1385 | // b) The src address has to be in RSI. |
1386 | // c) The buffer size will go in RCX. |
1387 | dstAddrRegMask = RBM_RDI; |
1388 | sourceRegMask = RBM_RSI; |
1389 | blkSizeRegMask = RBM_RCX; |
1390 | break; |
1391 | |
1392 | case GenTreeBlk::BlkOpKindHelper: |
1393 | #ifdef _TARGET_AMD64_ |
1394 | // The helper follows the regular AMD64 ABI. |
1395 | dstAddrRegMask = RBM_ARG_0; |
1396 | sourceRegMask = RBM_ARG_1; |
1397 | blkSizeRegMask = RBM_ARG_2; |
1398 | #else // !_TARGET_AMD64_ |
1399 | dstAddrRegMask = RBM_RDI; |
1400 | sourceRegMask = RBM_RAX; |
1401 | blkSizeRegMask = RBM_RCX; |
1402 | #endif // !_TARGET_AMD64_ |
1403 | break; |
1404 | |
1405 | default: |
1406 | unreached(); |
1407 | } |
1408 | } |
1409 | if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE)) |
1410 | { |
1411 | // This is a local source; we'll use a temp register for its address. |
1412 | assert(source->isContained() && source->OperIsLocal()); |
1413 | buildInternalIntRegisterDefForNode(blkNode, sourceRegMask); |
1414 | } |
1415 | } |
1416 | |
1417 | if ((size != 0) && (blkSizeRegMask != RBM_NONE)) |
1418 | { |
1419 | // Reserve a temp register for the block size argument. |
1420 | buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask); |
1421 | } |
1422 | |
1423 | if (!dstAddr->isContained() && !blkNode->IsReverseOp()) |
1424 | { |
1425 | srcCount++; |
1426 | BuildUse(dstAddr, dstAddrRegMask); |
1427 | } |
1428 | if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained()) |
1429 | { |
1430 | srcCount++; |
1431 | BuildUse(srcAddrOrFill, sourceRegMask); |
1432 | } |
1433 | if (!dstAddr->isContained() && blkNode->IsReverseOp()) |
1434 | { |
1435 | srcCount++; |
1436 | BuildUse(dstAddr, dstAddrRegMask); |
1437 | } |
1438 | |
1439 | if (size == 0) |
1440 | { |
1441 | assert(blkNode->OperIs(GT_STORE_DYN_BLK)); |
1442 | // The block size argument is a third argument to GT_STORE_DYN_BLK |
1443 | srcCount++; |
1444 | GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize; |
1445 | BuildUse(blockSize, blkSizeRegMask); |
1446 | } |
1447 | buildInternalRegisterUses(); |
1448 | regMaskTP killMask = getKillSetForBlockStore(blkNode); |
1449 | BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask); |
1450 | return srcCount; |
1451 | } |
1452 | |
1453 | #ifdef FEATURE_PUT_STRUCT_ARG_STK |
1454 | //------------------------------------------------------------------------ |
1455 | // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. |
1456 | // |
1457 | // Arguments: |
1458 | // tree - The node of interest |
1459 | // |
1460 | // Return Value: |
1461 | // The number of sources consumed by this node. |
1462 | // |
1463 | int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) |
1464 | { |
1465 | int srcCount = 0; |
1466 | if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST) |
1467 | { |
1468 | assert(putArgStk->gtOp1->isContained()); |
1469 | |
1470 | RefPosition* simdTemp = nullptr; |
1471 | RefPosition* intTemp = nullptr; |
1472 | unsigned prevOffset = putArgStk->getArgSize(); |
1473 | // We need to iterate over the fields twice; once to determine the need for internal temps, |
1474 | // and once to actually build the uses. |
1475 | for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest()) |
1476 | { |
1477 | GenTree* const fieldNode = current->Current(); |
1478 | const var_types fieldType = fieldNode->TypeGet(); |
1479 | const unsigned fieldOffset = current->gtFieldOffset; |
1480 | |
1481 | #ifdef _TARGET_X86_ |
1482 | assert(fieldType != TYP_LONG); |
1483 | #endif // _TARGET_X86_ |
1484 | |
1485 | #if defined(FEATURE_SIMD) |
1486 | // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the |
1487 | // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where |
1488 | // we "round up" to 16. |
1489 | if ((current->gtFieldType == TYP_SIMD12) && (simdTemp == nullptr)) |
1490 | { |
1491 | simdTemp = buildInternalFloatRegisterDefForNode(putArgStk); |
1492 | } |
1493 | #endif // defined(FEATURE_SIMD) |
1494 | |
1495 | #ifdef _TARGET_X86_ |
1496 | if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push) |
1497 | { |
1498 | // We can treat as a slot any field that is stored at a slot boundary, where the previous |
1499 | // field is not in the same slot. (Note that we store the fields in reverse order.) |
1500 | const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); |
1501 | if (intTemp == nullptr) |
1502 | { |
1503 | intTemp = buildInternalIntRegisterDefForNode(putArgStk); |
1504 | } |
1505 | if (!fieldIsSlot && varTypeIsByte(fieldType)) |
1506 | { |
1507 | // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes |
1508 | // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will |
1509 | // need a byte-addressable register for the store. We will enforce this requirement on an internal |
1510 | // register, which we can use to copy multiple byte values. |
1511 | intTemp->registerAssignment &= allByteRegs(); |
1512 | } |
1513 | } |
1514 | #endif // _TARGET_X86_ |
1515 | |
1516 | if (varTypeIsGC(fieldType)) |
1517 | { |
1518 | putArgStk->gtNumberReferenceSlots++; |
1519 | } |
1520 | prevOffset = fieldOffset; |
1521 | } |
1522 | |
1523 | for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest()) |
1524 | { |
1525 | GenTree* const fieldNode = current->Current(); |
1526 | if (!fieldNode->isContained()) |
1527 | { |
1528 | BuildUse(fieldNode); |
1529 | srcCount++; |
1530 | } |
1531 | } |
1532 | buildInternalRegisterUses(); |
1533 | |
1534 | return srcCount; |
1535 | } |
1536 | |
1537 | GenTree* src = putArgStk->gtOp1; |
1538 | var_types type = src->TypeGet(); |
1539 | |
1540 | #if defined(FEATURE_SIMD) && defined(_TARGET_X86_) |
1541 | // For PutArgStk of a TYP_SIMD12, we need an extra register. |
1542 | if (putArgStk->isSIMD12()) |
1543 | { |
1544 | buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates()); |
1545 | BuildUse(putArgStk->gtOp1); |
1546 | srcCount = 1; |
1547 | buildInternalRegisterUses(); |
1548 | return srcCount; |
1549 | } |
1550 | #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) |
1551 | |
1552 | if (type != TYP_STRUCT) |
1553 | { |
1554 | return BuildSimple(putArgStk); |
1555 | } |
1556 | |
1557 | GenTree* dst = putArgStk; |
1558 | GenTree* srcAddr = nullptr; |
1559 | |
    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
    // our framework assemblies, so this is the main code generation scheme we'll use.
1563 | ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; |
1564 | switch (putArgStk->gtPutArgStkKind) |
1565 | { |
1566 | case GenTreePutArgStk::Kind::Push: |
1567 | case GenTreePutArgStk::Kind::PushAllSlots: |
1568 | case GenTreePutArgStk::Kind::Unroll: |
1569 | // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. |
1570 | // |
1571 | // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. |
1572 | // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude |
1573 | // RBM_NON_BYTE_REGS from internal candidates. |
1574 | if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0) |
1575 | { |
1576 | regMaskTP regMask = allRegs(TYP_INT); |
1577 | |
1578 | #ifdef _TARGET_X86_ |
1579 | if ((size % 2) != 0) |
1580 | { |
1581 | regMask &= ~RBM_NON_BYTE_REGS; |
1582 | } |
1583 | #endif |
1584 | buildInternalIntRegisterDefForNode(putArgStk, regMask); |
1585 | } |
1586 | |
1587 | #ifdef _TARGET_X86_ |
1588 | if (size >= 8) |
1589 | #else // !_TARGET_X86_ |
1590 | if (size >= XMM_REGSIZE_BYTES) |
1591 | #endif // !_TARGET_X86_ |
1592 | { |
1593 | // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux, |
1594 | // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a |
1595 | // series of 16-byte loads and stores. |
1596 | buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates()); |
1597 | SetContainsAVXFlags(); |
1598 | } |
1599 | break; |
1600 | |
1601 | case GenTreePutArgStk::Kind::RepInstr: |
1602 | buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI); |
1603 | buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX); |
1604 | buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI); |
1605 | break; |
1606 | |
1607 | default: |
1608 | unreached(); |
1609 | } |
1610 | |
1611 | srcCount = BuildOperandUses(src); |
1612 | buildInternalRegisterUses(); |
1613 | return srcCount; |
1614 | } |
1615 | #endif // FEATURE_PUT_STRUCT_ARG_STK |
1616 | |
1617 | //------------------------------------------------------------------------ |
1618 | // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP. |
1619 | // |
1620 | // Arguments: |
1621 | // tree - The node of interest |
1622 | // |
1623 | // Return Value: |
1624 | // The number of sources consumed by this node. |
1625 | // |
1626 | int LinearScan::BuildLclHeap(GenTree* tree) |
1627 | { |
1628 | int srcCount = 1; |
1629 | |
1630 | // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): |
1631 | // Here '-' means don't care. |
1632 | // |
    //  Size?                    Init Memory?    # temp regs
    //   0                        -               0 (returns 0)
    //   const and <=6 reg words  -               0 (pushes '0')
    //   const and >6 reg words   Yes             0 (pushes '0')
    //   const and <PageSize      No              0 (amd64), 1 (x86)
    //                                              (x86: tmpReg for subtracting from esp)
    //   const and >=PageSize     No              2 (regCnt and tmpReg for subtracting from sp)
    //   Non-const                Yes             0 (regCnt=targetReg and pushes '0')
    //   Non-const                No              2 (regCnt and tmpReg for subtracting from sp)
1642 | // |
    // Note: Here we don't need the internal register to be different from targetReg.
    // Rather, we require it to be different from the operand's register.
1645 | |
1646 | GenTree* size = tree->gtGetOp1(); |
1647 | if (size->IsCnsIntOrI()) |
1648 | { |
1649 | assert(size->isContained()); |
1650 | srcCount = 0; |
1651 | size_t sizeVal = size->gtIntCon.gtIconVal; |
1652 | |
1653 | if (sizeVal == 0) |
1654 | { |
1655 | buildInternalIntRegisterDefForNode(tree); |
1656 | } |
1657 | else |
1658 | { |
1659 | // Compute the amount of memory to properly STACK_ALIGN. |
1660 | // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. |
1661 | // This should also help in debugging as we can examine the original size specified with localloc. |
1662 | sizeVal = AlignUp(sizeVal, STACK_ALIGN); |
1663 | |
1664 | // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc) |
1665 | // we will generate 'push 0'. |
1666 | assert((sizeVal % REGSIZE_BYTES) == 0); |
1667 | size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES; |
1668 | if (cntRegSizedWords > 6) |
1669 | { |
1670 | if (!compiler->info.compInitMem) |
1671 | { |
1672 | // No need to initialize allocated stack space. |
1673 | if (sizeVal < compiler->eeGetPageSize()) |
1674 | { |
1675 | #ifdef _TARGET_X86_ |
1676 | // x86 needs a register here to avoid generating "sub" on ESP. |
1677 | buildInternalIntRegisterDefForNode(tree); |
1678 | #endif |
1679 | } |
1680 | else |
1681 | { |
1682 | // We need two registers: regCnt and RegTmp |
1683 | buildInternalIntRegisterDefForNode(tree); |
1684 | buildInternalIntRegisterDefForNode(tree); |
1685 | } |
1686 | } |
1687 | } |
1688 | } |
1689 | } |
1690 | else |
1691 | { |
1692 | if (!compiler->info.compInitMem) |
1693 | { |
1694 | buildInternalIntRegisterDefForNode(tree); |
1695 | buildInternalIntRegisterDefForNode(tree); |
1696 | } |
1697 | BuildUse(size); |
1698 | } |
1699 | buildInternalRegisterUses(); |
1700 | BuildDef(tree); |
1701 | return srcCount; |
1702 | } |
1703 | |
1704 | //------------------------------------------------------------------------ |
1705 | // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV. |
1706 | // |
1707 | // Arguments: |
1708 | // tree - The node of interest |
1709 | // |
1710 | // Return Value: |
1711 | // The number of sources consumed by this node. |
1712 | // |
1713 | int LinearScan::BuildModDiv(GenTree* tree) |
1714 | { |
1715 | GenTree* op1 = tree->gtGetOp1(); |
1716 | GenTree* op2 = tree->gtGetOp2(); |
1717 | regMaskTP dstCandidates = RBM_NONE; |
1718 | RefPosition* internalDef = nullptr; |
1719 | int srcCount = 0; |
1720 | |
1721 | if (varTypeIsFloating(tree->TypeGet())) |
1722 | { |
1723 | return BuildSimple(tree); |
1724 | } |
1725 | |
1726 | // Amd64 Div/Idiv instruction: |
1727 | // Dividend in RAX:RDX and computes |
1728 | // Quotient in RAX, Remainder in RDX |
1729 | |
1730 | if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) |
1731 | { |
1732 | // We are interested in just the remainder. |
1733 | // RAX is used as a trashable register during computation of remainder. |
1734 | dstCandidates = RBM_RDX; |
1735 | } |
1736 | else |
1737 | { |
1738 | // We are interested in just the quotient. |
1739 | // RDX gets used as trashable register during computation of quotient |
1740 | dstCandidates = RBM_RAX; |
1741 | } |
1742 | |
1743 | #ifdef _TARGET_X86_ |
1744 | if (op1->OperGet() == GT_LONG) |
1745 | { |
1746 | assert(op1->isContained()); |
1747 | |
1748 | // To avoid reg move would like to have op1's low part in RAX and high part in RDX. |
1749 | GenTree* loVal = op1->gtGetOp1(); |
1750 | GenTree* hiVal = op1->gtGetOp2(); |
1751 | assert(!loVal->isContained() && !hiVal->isContained()); |
1752 | |
1753 | assert(op2->IsCnsIntOrI()); |
1754 | assert(tree->OperGet() == GT_UMOD); |
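        // These asserts capture the only shape expected here: an unsigned mod of a
        // decomposed long value by a constant divisor.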
1755 | |
1756 | // This situation also requires an internal register. |
1757 | buildInternalIntRegisterDefForNode(tree); |
1758 | |
1759 | BuildUse(loVal, RBM_EAX); |
1760 | BuildUse(hiVal, RBM_EDX); |
1761 | srcCount = 2; |
1762 | } |
1763 | else |
1764 | #endif |
1765 | { |
1766 | // If possible would like to have op1 in RAX to avoid a register move. |
1767 | RefPosition* op1Use = BuildUse(op1, RBM_EAX); |
1768 | tgtPrefUse = op1Use; |
1769 | srcCount = 1; |
1770 | } |
1771 | |
1772 | srcCount += BuildDelayFreeUses(op2, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); |
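    // The divisor must not end up in RAX or RDX: idiv/div implicitly uses both for the
    // dividend and for the quotient/remainder results, hence the exclusion above.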
1773 | |
1774 | buildInternalRegisterUses(); |
1775 | |
1776 | regMaskTP killMask = getKillSetForModDiv(tree->AsOp()); |
1777 | BuildDefsWithKills(tree, 1, dstCandidates, killMask); |
1778 | return srcCount; |
1779 | } |
1780 | |
1781 | //------------------------------------------------------------------------ |
1782 | // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC. |
1783 | // |
1784 | // Arguments: |
1785 | // tree - The node of interest |
1786 | // |
1787 | // Return Value: |
1788 | // The number of sources consumed by this node. |
1789 | // |
1790 | int LinearScan::BuildIntrinsic(GenTree* tree) |
1791 | { |
1792 | // Both operand and its result must be of floating point type. |
1793 | GenTree* op1 = tree->gtGetOp1(); |
1794 | assert(varTypeIsFloating(op1)); |
1795 | assert(op1->TypeGet() == tree->TypeGet()); |
1796 | RefPosition* internalFloatDef = nullptr; |
1797 | |
1798 | switch (tree->gtIntrinsic.gtIntrinsicId) |
1799 | { |
1800 | case CORINFO_INTRINSIC_Abs: |
1801 | // Abs(float x) = x & 0x7fffffff |
1802 | // Abs(double x) = x & 0x7ffffff ffffffff |
1803 | |
1804 | // In case of Abs we need an internal register to hold mask. |
1805 | |
1806 | // TODO-XArch-CQ: avoid using an internal register for the mask. |
1807 | // Andps or andpd both will operate on 128-bit operands. |
1808 | // The data section constant to hold the mask is a 64-bit size. |
1809 | // Therefore, we need both the operand and mask to be in |
1810 | // xmm register. When we add support in emitter to emit 128-bit |
1811 | // data constants and instructions that operate on 128-bit |
1812 | // memory operands we can avoid the need for an internal register. |
1813 | if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) |
1814 | { |
1815 | internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates()); |
1816 | } |
1817 | break; |
1818 | |
1819 | #ifdef _TARGET_X86_ |
1820 | case CORINFO_INTRINSIC_Cos: |
1821 | case CORINFO_INTRINSIC_Sin: |
            NYI_X86("Math intrinsics Cos and Sin");
1823 | break; |
1824 | #endif // _TARGET_X86_ |
1825 | |
1826 | case CORINFO_INTRINSIC_Sqrt: |
1827 | case CORINFO_INTRINSIC_Round: |
1828 | case CORINFO_INTRINSIC_Ceiling: |
1829 | case CORINFO_INTRINSIC_Floor: |
1830 | break; |
1831 | |
        default:
            // Sqrt, Abs, Round, Ceiling and Floor are the only math intrinsics handled here.
            noway_assert(!"Unsupported math intrinsic");
1835 | unreached(); |
1836 | break; |
1837 | } |
1838 | assert(tree->gtGetOp2IfPresent() == nullptr); |
1839 | int srcCount; |
1840 | if (op1->isContained()) |
1841 | { |
1842 | srcCount = BuildOperandUses(op1); |
1843 | } |
1844 | else |
1845 | { |
1846 | tgtPrefUse = BuildUse(op1); |
1847 | srcCount = 1; |
1848 | } |
1849 | if (internalFloatDef != nullptr) |
1850 | { |
1851 | buildInternalRegisterUses(); |
1852 | } |
1853 | BuildDef(tree); |
1854 | return srcCount; |
1855 | } |
1856 | |
1857 | #ifdef FEATURE_SIMD |
1858 | //------------------------------------------------------------------------ |
1859 | // BuildSIMD: Set the NodeInfo for a GT_SIMD tree. |
1860 | // |
1861 | // Arguments: |
1862 | // tree - The GT_SIMD node of interest |
1863 | // |
1864 | // Return Value: |
1865 | // The number of sources consumed by this node. |
1866 | // |
1867 | int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) |
1868 | { |
1869 | // Only SIMDIntrinsicInit can be contained. Other than that, |
1870 | // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount. |
1871 | int dstCount = simdTree->IsValue() ? 1 : 0; |
1872 | bool buildUses = true; |
1873 | regMaskTP dstCandidates = RBM_NONE; |
1874 | |
1875 | if (simdTree->isContained()) |
1876 | { |
1877 | assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit); |
1878 | } |
1879 | else if (dstCount != 1) |
1880 | { |
1881 | assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) || |
1882 | (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality)); |
1883 | } |
1884 | SetContainsAVXFlags(true, simdTree->gtSIMDSize); |
1885 | GenTree* op1 = simdTree->gtGetOp1(); |
1886 | GenTree* op2 = simdTree->gtGetOp2(); |
1887 | int srcCount = 0; |
1888 | |
1889 | switch (simdTree->gtSIMDIntrinsicID) |
1890 | { |
1891 | case SIMDIntrinsicInit: |
1892 | { |
1893 | // This sets all fields of a SIMD struct to the given value. |
1894 | // Mark op1 as contained if it is either zero or int constant of all 1's, |
1895 | // or a float constant with 16 or 32 byte simdType (AVX case) |
1896 | // |
1897 | // Note that for small int base types, the initVal has been constructed so that |
1898 | // we can use the full int value. |
1899 | CLANG_FORMAT_COMMENT_ANCHOR; |
1900 | |
1901 | #if !defined(_TARGET_64BIT_) |
1902 | if (op1->OperGet() == GT_LONG) |
1903 | { |
1904 | assert(op1->isContained()); |
1905 | GenTree* op1lo = op1->gtGetOp1(); |
1906 | GenTree* op1hi = op1->gtGetOp2(); |
1907 | |
1908 | if (op1lo->isContained()) |
1909 | { |
1910 | srcCount = 0; |
1911 | assert(op1hi->isContained()); |
1912 | assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || |
1913 | (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))); |
1914 | } |
1915 | else |
1916 | { |
1917 | srcCount = 2; |
1918 | buildInternalFloatRegisterDefForNode(simdTree); |
1919 | setInternalRegsDelayFree = true; |
1920 | } |
1921 | |
1922 | if (srcCount == 2) |
1923 | { |
1924 | BuildUse(op1lo, RBM_EAX); |
1925 | BuildUse(op1hi, RBM_EDX); |
1926 | } |
1927 | buildUses = false; |
1928 | } |
1929 | #endif // !defined(_TARGET_64BIT_) |
1930 | } |
1931 | break; |
1932 | |
1933 | case SIMDIntrinsicInitN: |
1934 | { |
1935 | var_types baseType = simdTree->gtSIMDBaseType; |
1936 | srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType)); |
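            // For example, initializing a Vector4 from four floats gives 16 / 4 = 4 source values.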
1937 | // Need an internal register to stitch together all the values into a single vector in a SIMD reg. |
1938 | buildInternalFloatRegisterDefForNode(simdTree); |
1939 | int initCount = 0; |
1940 | for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2()) |
1941 | { |
1942 | assert(list->OperGet() == GT_LIST); |
1943 | GenTree* listItem = list->gtGetOp1(); |
1944 | assert(listItem->TypeGet() == baseType); |
1945 | assert(!listItem->isContained()); |
1946 | BuildUse(listItem); |
1947 | initCount++; |
1948 | } |
1949 | assert(initCount == srcCount); |
1950 | buildUses = false; |
1951 | } |
1952 | break; |
1953 | |
1954 | case SIMDIntrinsicInitArray: |
1955 | // We have an array and an index, which may be contained. |
1956 | break; |
1957 | |
1958 | case SIMDIntrinsicDiv: |
1959 | // SSE2 has no instruction support for division on integer vectors |
1960 | noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); |
1961 | break; |
1962 | |
1963 | case SIMDIntrinsicAbs: |
1964 | // float/double vectors: This gets implemented as bitwise-And operation |
1965 | // with a mask and hence should never see here. |
1966 | // |
1967 | // Must be a Vector<int> or Vector<short> Vector<sbyte> |
1968 | assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT || |
1969 | simdTree->gtSIMDBaseType == TYP_BYTE); |
1970 | assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported); |
1971 | break; |
1972 | |
1973 | case SIMDIntrinsicSqrt: |
1974 | // SSE2 has no instruction support for sqrt on integer vectors. |
1975 | noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); |
1976 | break; |
1977 | |
1978 | case SIMDIntrinsicAdd: |
1979 | case SIMDIntrinsicSub: |
1980 | case SIMDIntrinsicMul: |
1981 | case SIMDIntrinsicBitwiseAnd: |
1982 | case SIMDIntrinsicBitwiseAndNot: |
1983 | case SIMDIntrinsicBitwiseOr: |
1984 | case SIMDIntrinsicBitwiseXor: |
1985 | case SIMDIntrinsicMin: |
1986 | case SIMDIntrinsicMax: |
1987 | // SSE2 32-bit integer multiplication requires two temp regs |
1988 | if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT && |
1989 | compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) |
1990 | { |
1991 | buildInternalFloatRegisterDefForNode(simdTree); |
1992 | buildInternalFloatRegisterDefForNode(simdTree); |
1993 | } |
1994 | break; |
1995 | |
1996 | case SIMDIntrinsicEqual: |
1997 | break; |
1998 | |
1999 | // SSE2 doesn't support < and <= directly on int vectors. |
2000 | // Instead we need to use > and >= with swapped operands. |
2001 | case SIMDIntrinsicLessThan: |
2002 | case SIMDIntrinsicLessThanOrEqual: |
2003 | noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType)); |
2004 | break; |
2005 | |
2006 | // SIMDIntrinsicEqual is supported only on non-floating point base type vectors. |
2007 | // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors. |
2008 | // Instead we need to use < and <= with swapped operands. |
2009 | case SIMDIntrinsicGreaterThan: |
2010 | noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType)); |
2011 | break; |
2012 | |
2013 | case SIMDIntrinsicOpEquality: |
2014 | case SIMDIntrinsicOpInEquality: |
2015 | if (simdTree->gtGetOp2()->isContained()) |
2016 | { |
2017 | // If the second operand is contained then ContainCheckSIMD has determined |
2018 | // that PTEST can be used. We only need a single source register and no |
2019 | // internal registers. |
2020 | } |
2021 | else |
2022 | { |
2023 | // Can't use PTEST so we need 2 source registers, 1 internal SIMD register |
2024 | // (to hold the result of PCMPEQD or other similar SIMD compare instruction) |
2025 | // and one internal INT register (to hold the result of PMOVMSKB). |
2026 | buildInternalIntRegisterDefForNode(simdTree); |
2027 | buildInternalFloatRegisterDefForNode(simdTree); |
2028 | } |
2029 | // These SIMD nodes only set the condition flags. |
2030 | dstCount = 0; |
2031 | break; |
2032 | |
2033 | case SIMDIntrinsicDotProduct: |
2034 | // Float/Double vectors: |
2035 | // For SSE, or AVX with 32-byte vectors, we also need an internal register |
2036 | // as scratch. Further we need the targetReg and internal reg to be distinct |
2037 | // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we |
2038 | // don't need a tmpReg. |
2039 | // |
2040 | // 32-byte integer vector on SSE4/AVX: |
2041 | // will take advantage of phaddd, which operates only on 128-bit xmm reg. |
2042 | // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal |
2043 | // registers since targetReg is an int type register. |
2044 | // |
2045 | // See genSIMDIntrinsicDotProduct() for details on code sequence generated |
2046 | // and the need for scratch registers. |
2047 | if (varTypeIsFloating(simdTree->gtSIMDBaseType)) |
2048 | { |
2049 | if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || |
2050 | (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32)) |
2051 | { |
2052 | buildInternalFloatRegisterDefForNode(simdTree); |
2053 | setInternalRegsDelayFree = true; |
2054 | } |
2055 | // else don't need scratch reg(s). |
2056 | } |
2057 | else |
2058 | { |
2059 | assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported); |
2060 | |
2061 | // No need to setInternalRegsDelayFree since targetReg is a |
2062 | // an int type reg and guaranteed to be different from xmm/ymm |
2063 | // regs. |
2064 | buildInternalFloatRegisterDefForNode(simdTree); |
2065 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
2066 | { |
2067 | buildInternalFloatRegisterDefForNode(simdTree); |
2068 | } |
2069 | } |
2070 | break; |
2071 | |
2072 | case SIMDIntrinsicGetItem: |
2073 | { |
2074 | // This implements get_Item method. The sources are: |
2075 | // - the source SIMD struct |
2076 | // - index (which element to get) |
2077 | // The result is baseType of SIMD struct. |
2078 | // op1 may be a contained memory op, but if so we will consume its address. |
2079 | // op2 may be a contained constant. |
2080 | op1 = simdTree->gtGetOp1(); |
2081 | op2 = simdTree->gtGetOp2(); |
2082 | |
2083 | if (!op1->isContained()) |
2084 | { |
2085 | // If the index is not a constant, we will use the SIMD temp location to store the vector. |
2086 | // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we |
2087 | // can use that in the process of extracting the element. |
2088 | // |
2089 | // If the index is a constant and base type is a small int we can use pextrw, but on AVX |
2090 | // we will need a temp if are indexing into the upper half of the AVX register. |
2091 | // In all other cases with constant index, we need a temp xmm register to extract the |
2092 | // element if index is other than zero. |
2093 | |
2094 | if (!op2->IsCnsIntOrI()) |
2095 | { |
2096 | (void)compiler->getSIMDInitTempVarNum(); |
2097 | } |
2098 | else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) |
2099 | { |
2100 | bool needFloatTemp; |
2101 | if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && |
2102 | (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)) |
2103 | { |
2104 | int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); |
2105 | needFloatTemp = (byteShiftCnt >= 16); |
2106 | } |
2107 | else |
2108 | { |
2109 | needFloatTemp = !op2->IsIntegralConst(0); |
2110 | } |
2111 | |
2112 | if (needFloatTemp) |
2113 | { |
2114 | buildInternalFloatRegisterDefForNode(simdTree); |
2115 | } |
2116 | } |
2117 | #ifdef _TARGET_X86_ |
2118 | // This logic is duplicated from genSIMDIntrinsicGetItem(). |
2119 | // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to |
2120 | // generate a movzx/movsx. On x86, these require byteable registers. So figure out which |
2121 | // cases will require this, so the non-byteable registers can be excluded. |
2122 | |
2123 | var_types baseType = simdTree->gtSIMDBaseType; |
2124 | if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType)) |
2125 | { |
2126 | bool ZeroOrSignExtnReqd = true; |
2127 | unsigned baseSize = genTypeSize(baseType); |
2128 | if (baseSize == 1) |
2129 | { |
2130 | if ((op2->gtIntCon.gtIconVal % 2) == 1) |
2131 | { |
2132 | ZeroOrSignExtnReqd = (baseType == TYP_BYTE); |
2133 | } |
2134 | } |
2135 | else |
2136 | { |
2137 | assert(baseSize == 2); |
2138 | ZeroOrSignExtnReqd = (baseType == TYP_SHORT); |
2139 | } |
2140 | if (ZeroOrSignExtnReqd) |
2141 | { |
2142 | dstCandidates = allByteRegs(); |
2143 | } |
2144 | } |
2145 | #endif // _TARGET_X86_ |
2146 | } |
2147 | } |
2148 | break; |
2149 | |
2150 | case SIMDIntrinsicSetX: |
2151 | case SIMDIntrinsicSetY: |
2152 | case SIMDIntrinsicSetZ: |
2153 | case SIMDIntrinsicSetW: |
2154 | // We need an internal integer register for SSE2 codegen |
2155 | if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) |
2156 | { |
2157 | buildInternalIntRegisterDefForNode(simdTree); |
2158 | } |
2159 | |
2160 | break; |
2161 | |
2162 | case SIMDIntrinsicCast: |
2163 | break; |
2164 | |
2165 | case SIMDIntrinsicConvertToSingle: |
2166 | if (simdTree->gtSIMDBaseType == TYP_UINT) |
2167 | { |
2168 | // We need an internal register different from targetReg. |
2169 | setInternalRegsDelayFree = true; |
2170 | buildInternalFloatRegisterDefForNode(simdTree); |
2171 | buildInternalFloatRegisterDefForNode(simdTree); |
2172 | // We also need an integer register. |
2173 | buildInternalIntRegisterDefForNode(simdTree); |
2174 | } |
2175 | break; |
2176 | |
2177 | case SIMDIntrinsicConvertToInt32: |
2178 | break; |
2179 | |
2180 | case SIMDIntrinsicWidenLo: |
2181 | case SIMDIntrinsicWidenHi: |
2182 | if (varTypeIsIntegral(simdTree->gtSIMDBaseType)) |
2183 | { |
2184 | // We need an internal register different from targetReg. |
2185 | setInternalRegsDelayFree = true; |
2186 | buildInternalFloatRegisterDefForNode(simdTree); |
2187 | } |
2188 | break; |
2189 | |
2190 | case SIMDIntrinsicConvertToInt64: |
2191 | // We need an internal register different from targetReg. |
2192 | setInternalRegsDelayFree = true; |
2193 | buildInternalFloatRegisterDefForNode(simdTree); |
2194 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
2195 | { |
2196 | buildInternalFloatRegisterDefForNode(simdTree); |
2197 | } |
2198 | // We also need an integer register. |
2199 | buildInternalIntRegisterDefForNode(simdTree); |
2200 | break; |
2201 | |
2202 | case SIMDIntrinsicConvertToDouble: |
2203 | // We need an internal register different from targetReg. |
2204 | setInternalRegsDelayFree = true; |
2205 | buildInternalFloatRegisterDefForNode(simdTree); |
2206 | #ifdef _TARGET_X86_ |
2207 | if (simdTree->gtSIMDBaseType == TYP_LONG) |
2208 | { |
2209 | buildInternalFloatRegisterDefForNode(simdTree); |
2210 | buildInternalFloatRegisterDefForNode(simdTree); |
2211 | } |
2212 | else |
2213 | #endif |
2214 | if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG)) |
2215 | { |
2216 | buildInternalFloatRegisterDefForNode(simdTree); |
2217 | } |
2218 | // We also need an integer register. |
2219 | buildInternalIntRegisterDefForNode(simdTree); |
2220 | break; |
2221 | |
2222 | case SIMDIntrinsicNarrow: |
2223 | // We need an internal register different from targetReg. |
2224 | setInternalRegsDelayFree = true; |
2225 | buildInternalFloatRegisterDefForNode(simdTree); |
2226 | if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE)) |
2227 | { |
2228 | buildInternalFloatRegisterDefForNode(simdTree); |
2229 | } |
2230 | break; |
2231 | |
2232 | case SIMDIntrinsicShuffleSSE2: |
2233 | // Second operand is an integer constant and marked as contained. |
2234 | assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); |
2235 | break; |
2236 | |
2237 | case SIMDIntrinsicGetX: |
2238 | case SIMDIntrinsicGetY: |
2239 | case SIMDIntrinsicGetZ: |
2240 | case SIMDIntrinsicGetW: |
2241 | case SIMDIntrinsicGetOne: |
2242 | case SIMDIntrinsicGetZero: |
2243 | case SIMDIntrinsicGetCount: |
2244 | case SIMDIntrinsicGetAllOnes: |
            assert(!"Get intrinsics should not be seen during Lowering.");
2246 | unreached(); |
2247 | |
2248 | default: |
            noway_assert(!"Unimplemented SIMD node type.");
2250 | unreached(); |
2251 | } |
2252 | if (buildUses) |
2253 | { |
2254 | assert(!op1->OperIs(GT_LIST)); |
2255 | assert(srcCount == 0); |
2256 | // This is overly conservative, but is here for zero diffs. |
2257 | srcCount = BuildRMWUses(simdTree); |
2258 | } |
2259 | buildInternalRegisterUses(); |
2260 | if (dstCount == 1) |
2261 | { |
2262 | BuildDef(simdTree, dstCandidates); |
2263 | } |
2264 | else |
2265 | { |
2266 | assert(dstCount == 0); |
2267 | } |
2268 | return srcCount; |
2269 | } |
2270 | #endif // FEATURE_SIMD |
2271 | |
2272 | #ifdef FEATURE_HW_INTRINSICS |
2273 | //------------------------------------------------------------------------ |
2274 | // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree. |
2275 | // |
2276 | // Arguments: |
2277 | // tree - The GT_HWIntrinsic node of interest |
2278 | // |
2279 | // Return Value: |
2280 | // The number of sources consumed by this node. |
2281 | // |
2282 | int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) |
2283 | { |
2284 | NamedIntrinsic intrinsicId = intrinsicTree->gtHWIntrinsicId; |
2285 | var_types baseType = intrinsicTree->gtSIMDBaseType; |
2286 | InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId); |
2287 | HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); |
2288 | int numArgs = HWIntrinsicInfo::lookupNumArgs(intrinsicTree); |
2289 | |
2290 | if ((isa == InstructionSet_AVX) || (isa == InstructionSet_AVX2)) |
2291 | { |
2292 | SetContainsAVXFlags(true, 32); |
2293 | } |
2294 | |
2295 | GenTree* op1 = intrinsicTree->gtGetOp1(); |
2296 | GenTree* op2 = intrinsicTree->gtGetOp2(); |
2297 | GenTree* op3 = nullptr; |
2298 | GenTree* lastOp = nullptr; |
2299 | |
2300 | int srcCount = 0; |
2301 | int dstCount = intrinsicTree->IsValue() ? 1 : 0; |
2302 | |
2303 | regMaskTP dstCandidates = RBM_NONE; |
2304 | |
2305 | if (op1 == nullptr) |
2306 | { |
2307 | assert(op2 == nullptr); |
2308 | assert(numArgs == 0); |
2309 | } |
2310 | else |
2311 | { |
2312 | if (op1->OperIsList()) |
2313 | { |
2314 | assert(op2 == nullptr); |
2315 | assert(numArgs >= 3); |
2316 | |
2317 | GenTreeArgList* argList = op1->AsArgList(); |
2318 | |
2319 | op1 = argList->Current(); |
2320 | argList = argList->Rest(); |
2321 | |
2322 | op2 = argList->Current(); |
2323 | argList = argList->Rest(); |
2324 | |
2325 | op3 = argList->Current(); |
2326 | |
2327 | while (argList->Rest() != nullptr) |
2328 | { |
2329 | argList = argList->Rest(); |
2330 | } |
2331 | |
2332 | lastOp = argList->Current(); |
2333 | argList = argList->Rest(); |
2334 | |
2335 | assert(argList == nullptr); |
2336 | } |
2337 | else if (op2 != nullptr) |
2338 | { |
2339 | assert(numArgs == 2); |
2340 | lastOp = op2; |
2341 | } |
2342 | else |
2343 | { |
2344 | assert(numArgs == 1); |
2345 | lastOp = op1; |
2346 | } |
2347 | |
2348 | assert(lastOp != nullptr); |
2349 | |
2350 | bool buildUses = true; |
2351 | |
2352 | if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId)) |
2353 | { |
2354 | if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed()) |
2355 | { |
2356 | assert(!lastOp->IsCnsIntOrI()); |
2357 | |
2358 | // We need two extra reg when lastOp isn't a constant so |
2359 | // the offset into the jump table for the fallback path |
2360 | // can be computed. |
2361 | buildInternalIntRegisterDefForNode(intrinsicTree); |
2362 | buildInternalIntRegisterDefForNode(intrinsicTree); |
2363 | } |
2364 | } |
2365 | |
2366 | // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it |
2367 | // is not allocated the same register as the target. |
2368 | bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); |
2369 | |
2370 | // Create internal temps, and handle any other special requirements. |
2371 | // Note that the default case for building uses will handle the RMW flag, but if the uses |
2372 | // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree) |
2373 | // must be handled within the case. |
2374 | switch (intrinsicId) |
2375 | { |
2376 | case NI_Base_Vector128_CreateScalarUnsafe: |
2377 | case NI_Base_Vector128_ToScalar: |
2378 | case NI_Base_Vector256_CreateScalarUnsafe: |
2379 | case NI_Base_Vector256_ToScalar: |
2380 | { |
2381 | assert(numArgs == 1); |
2382 | |
2383 | if (varTypeIsFloating(baseType)) |
2384 | { |
2385 | if (op1->isContained()) |
2386 | { |
2387 | srcCount += BuildOperandUses(op1); |
2388 | } |
2389 | else |
2390 | { |
2391 | // We will either be in memory and need to be moved |
2392 | // into a register of the appropriate size or we |
2393 | // are already in an XMM/YMM register and can stay |
2394 | // where we are. |
2395 | |
2396 | tgtPrefUse = BuildUse(op1); |
2397 | srcCount += 1; |
2398 | } |
2399 | |
2400 | buildUses = false; |
2401 | } |
2402 | break; |
2403 | } |
2404 | |
2405 | case NI_Base_Vector128_ToVector256: |
2406 | case NI_Base_Vector128_ToVector256Unsafe: |
2407 | case NI_Base_Vector256_GetLower: |
2408 | { |
2409 | assert(numArgs == 1); |
2410 | |
2411 | if (op1->isContained()) |
2412 | { |
2413 | srcCount += BuildOperandUses(op1); |
2414 | } |
2415 | else |
2416 | { |
2417 | // We will either be in memory and need to be moved |
2418 | // into a register of the appropriate size or we |
2419 | // are already in an XMM/YMM register and can stay |
2420 | // where we are. |
2421 | |
2422 | tgtPrefUse = BuildUse(op1); |
2423 | srcCount += 1; |
2424 | } |
2425 | |
2426 | buildUses = false; |
2427 | break; |
2428 | } |
2429 | |
2430 | case NI_SSE_CompareEqualOrderedScalar: |
2431 | case NI_SSE_CompareEqualUnorderedScalar: |
2432 | case NI_SSE_CompareNotEqualOrderedScalar: |
2433 | case NI_SSE_CompareNotEqualUnorderedScalar: |
2434 | case NI_SSE2_CompareEqualOrderedScalar: |
2435 | case NI_SSE2_CompareEqualUnorderedScalar: |
2436 | case NI_SSE2_CompareNotEqualOrderedScalar: |
2437 | case NI_SSE2_CompareNotEqualUnorderedScalar: |
2438 | { |
2439 | buildInternalIntRegisterDefForNode(intrinsicTree, allByteRegs()); |
2440 | setInternalRegsDelayFree = true; |
2441 | break; |
2442 | } |
2443 | |
2444 | case NI_SSE2_MaskMove: |
2445 | { |
2446 | assert(numArgs == 3); |
2447 | assert(!isRMW); |
2448 | |
2449 | // MaskMove hardcodes the destination (op3) in DI/EDI/RDI |
2450 | srcCount += BuildOperandUses(op1); |
2451 | srcCount += BuildOperandUses(op2); |
2452 | srcCount += BuildOperandUses(op3, RBM_EDI); |
2453 | |
2454 | buildUses = false; |
2455 | break; |
2456 | } |
2457 | |
2458 | case NI_SSE41_BlendVariable: |
2459 | { |
2460 | assert(numArgs == 3); |
2461 | |
2462 | if (!compiler->canUseVexEncoding()) |
2463 | { |
2464 | assert(isRMW); |
2465 | |
2466 | // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 |
2467 | srcCount += BuildOperandUses(op1); |
2468 | srcCount += BuildDelayFreeUses(op2); |
2469 | srcCount += BuildDelayFreeUses(op3, RBM_XMM0); |
2470 | |
2471 | buildUses = false; |
2472 | } |
2473 | break; |
2474 | } |
2475 | |
2476 | case NI_SSE41_TestAllOnes: |
2477 | { |
2478 | buildInternalFloatRegisterDefForNode(intrinsicTree); |
2479 | break; |
2480 | } |
2481 | |
2482 | case NI_SSE41_Extract: |
2483 | { |
2484 | if (baseType == TYP_FLOAT) |
2485 | { |
2486 | buildInternalIntRegisterDefForNode(intrinsicTree); |
2487 | } |
2488 | #ifdef _TARGET_X86_ |
2489 | else if (varTypeIsByte(baseType)) |
2490 | { |
2491 | dstCandidates = allByteRegs(); |
2492 | } |
2493 | #endif |
2494 | break; |
2495 | } |
2496 | |
2497 | #ifdef _TARGET_X86_ |
2498 | case NI_SSE42_Crc32: |
2499 | case NI_SSE42_X64_Crc32: |
2500 | { |
2501 | // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument |
2502 | // to the code generator. We may want to encode the overload info in another way. |
2503 | |
2504 | assert(numArgs == 2); |
2505 | assert(isRMW); |
2506 | |
2507 | // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers. |
2508 | srcCount += BuildOperandUses(op1); |
2509 | srcCount += BuildDelayFreeUses(op2, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE); |
2510 | |
2511 | buildUses = false; |
2512 | break; |
2513 | } |
2514 | #endif // _TARGET_X86_ |
2515 | |
2516 | case NI_BMI2_MultiplyNoFlags: |
2517 | case NI_BMI2_X64_MultiplyNoFlags: |
2518 | { |
2519 | assert(numArgs == 2 || numArgs == 3); |
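                // mulx takes one multiplicand implicitly in EDX/RDX, which is why op1 is
                // constrained to RBM_EDX here.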
2520 | srcCount += BuildOperandUses(op1, RBM_EDX); |
2521 | srcCount += BuildOperandUses(op2); |
2522 | if (numArgs == 3) |
2523 | { |
2524 | // op3 reg should be different from target reg to |
2525 | // store the lower half result after executing the instruction |
2526 | srcCount += BuildDelayFreeUses(op3); |
2527 | // Need a internal register different from the dst to take the lower half result |
2528 | buildInternalIntRegisterDefForNode(intrinsicTree); |
2529 | setInternalRegsDelayFree = true; |
2530 | } |
2531 | buildUses = false; |
2532 | break; |
2533 | } |
2534 | |
2535 | case NI_FMA_MultiplyAdd: |
2536 | case NI_FMA_MultiplyAddNegated: |
2537 | case NI_FMA_MultiplyAddNegatedScalar: |
2538 | case NI_FMA_MultiplyAddScalar: |
2539 | case NI_FMA_MultiplyAddSubtract: |
2540 | case NI_FMA_MultiplySubtract: |
2541 | case NI_FMA_MultiplySubtractAdd: |
2542 | case NI_FMA_MultiplySubtractNegated: |
2543 | case NI_FMA_MultiplySubtractNegatedScalar: |
2544 | case NI_FMA_MultiplySubtractScalar: |
2545 | { |
2546 | assert(numArgs == 3); |
2547 | assert(isRMW); |
2548 | |
2549 | const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId); |
2550 | |
2551 | // Intrinsics with CopyUpperBits semantics cannot have op1 be contained |
2552 | assert(!copiesUpperBits || !op1->isContained()); |
2553 | |
2554 | if (op3->isContained()) |
2555 | { |
2556 | // 213 form: op1 = (op2 * op1) + [op3] |
2557 | |
2558 | if (copiesUpperBits) |
2559 | { |
2560 | tgtPrefUse = BuildUse(op1); |
2561 | |
2562 | srcCount += 1; |
2563 | srcCount += BuildDelayFreeUses(op2); |
2564 | } |
2565 | else |
2566 | { |
2567 | // op1 and op2 are commutative, so don't |
2568 | // set either to be tgtPref or delayFree |
2569 | |
2570 | srcCount += BuildOperandUses(op1); |
2571 | srcCount += BuildOperandUses(op2); |
2572 | } |
2573 | |
2574 | srcCount += BuildOperandUses(op3); |
2575 | } |
2576 | else if (op2->isContained()) |
2577 | { |
2578 | // 132 form: op1 = (op1 * op3) + [op2] |
2579 | |
2580 | tgtPrefUse = BuildUse(op1); |
2581 | |
2582 | srcCount += 1; |
2583 | srcCount += BuildOperandUses(op2); |
2584 | srcCount += BuildDelayFreeUses(op3); |
2585 | } |
2586 | else if (op1->isContained()) |
2587 | { |
2588 | // 231 form: op3 = (op2 * op3) + [op1] |
2589 | |
2590 | tgtPrefUse = BuildUse(op3); |
2591 | |
2592 | srcCount += BuildOperandUses(op1); |
2593 | srcCount += BuildDelayFreeUses(op2); |
2594 | srcCount += 1; |
2595 | } |
2596 | else |
2597 | { |
2598 | // 213 form: op1 = (op2 * op1) + op3 |
2599 | |
2600 | if (copiesUpperBits) |
2601 | { |
2602 | tgtPrefUse = BuildUse(op1); |
2603 | |
2604 | srcCount += 1; |
2605 | srcCount += BuildDelayFreeUses(op2); |
2606 | } |
2607 | else |
2608 | { |
2609 | // op1 and op2 are commutative, so don't |
2610 | // set either to be tgtPref or delayFree |
2611 | |
2612 | srcCount += BuildOperandUses(op1); |
2613 | srcCount += BuildOperandUses(op2); |
2614 | } |
2615 | |
2616 | srcCount += BuildDelayFreeUses(op3); |
2617 | } |
2618 | |
2619 | buildUses = false; |
2620 | break; |
2621 | } |
2622 | |
2623 | case NI_AVX2_GatherVector128: |
2624 | case NI_AVX2_GatherVector256: |
2625 | { |
2626 | assert(numArgs == 3); |
2627 | // Any pair of the index, mask, or destination registers should be different |
2628 | srcCount += BuildOperandUses(op1); |
2629 | srcCount += BuildDelayFreeUses(op2); |
2630 | |
2631 | // get a tmp register for mask that will be cleared by gather instructions |
2632 | buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); |
2633 | setInternalRegsDelayFree = true; |
2634 | |
2635 | buildUses = false; |
2636 | break; |
2637 | } |
2638 | |
2639 | case NI_AVX2_GatherMaskVector128: |
2640 | case NI_AVX2_GatherMaskVector256: |
2641 | { |
2642 | assert(numArgs == 5); |
2643 | // Any pair of the index, mask, or destination registers should be different |
2644 | srcCount += BuildOperandUses(op1); |
2645 | srcCount += BuildOperandUses(op2); |
2646 | srcCount += BuildDelayFreeUses(op3); |
2647 | |
2648 | assert(intrinsicTree->gtGetOp1()->OperIsList()); |
2649 | GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList(); |
2650 | GenTree* op4 = argList->Rest()->Rest()->Rest()->Current(); |
2651 | srcCount += BuildDelayFreeUses(op4); |
2652 | |
2653 | // get a tmp register for mask that will be cleared by gather instructions |
2654 | buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); |
2655 | setInternalRegsDelayFree = true; |
2656 | |
2657 | buildUses = false; |
2658 | break; |
2659 | } |
2660 | |
2661 | default: |
2662 | { |
2663 | assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); |
2664 | break; |
2665 | } |
2666 | } |
2667 | |
2668 | if (buildUses) |
2669 | { |
2670 | assert((numArgs > 0) && (numArgs < 4)); |
2671 | |
2672 | srcCount += BuildOperandUses(op1); |
2673 | |
2674 | if (op2 != nullptr) |
2675 | { |
2676 | srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2); |
2677 | |
2678 | if (op3 != nullptr) |
2679 | { |
2680 | srcCount += (isRMW) ? BuildDelayFreeUses(op3) : BuildOperandUses(op3); |
2681 | } |
2682 | } |
2683 | } |
2684 | |
2685 | buildInternalRegisterUses(); |
2686 | } |
2687 | |
2688 | if (dstCount == 1) |
2689 | { |
2690 | BuildDef(intrinsicTree, dstCandidates); |
2691 | } |
2692 | else |
2693 | { |
2694 | assert(dstCount == 0); |
2695 | } |
2696 | |
2697 | return srcCount; |
2698 | } |
2699 | #endif |
2700 | |
2701 | //------------------------------------------------------------------------ |
2702 | // BuildCast: Set the NodeInfo for a GT_CAST. |
2703 | // |
2704 | // Arguments: |
2705 | // cast - The GT_CAST node |
2706 | // |
2707 | // Return Value: |
2708 | // The number of sources consumed by this node. |
2709 | // |
2710 | int LinearScan::BuildCast(GenTreeCast* cast) |
2711 | { |
2712 | GenTree* src = cast->gtGetOp1(); |
2713 | |
2714 | const var_types srcType = genActualType(src->TypeGet()); |
2715 | const var_types castType = cast->gtCastType; |
2716 | |
2717 | regMaskTP candidates = RBM_NONE; |
2718 | #ifdef _TARGET_X86_ |
2719 | if (varTypeIsByte(castType)) |
2720 | { |
2721 | candidates = allByteRegs(); |
2722 | } |
2723 | |
2724 | assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained())); |
2725 | #else |
2726 | // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary |
2727 | // register to extract the upper 32 bits of the 64 bit source register. |
2728 | if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT)) |
2729 | { |
2730 | // Here we don't need internal register to be different from targetReg, |
2731 | // rather require it to be different from operand's reg. |
2732 | buildInternalIntRegisterDefForNode(cast); |
2733 | } |
2734 | #endif |
2735 | |
2736 | int srcCount = BuildOperandUses(src, candidates); |
2737 | buildInternalRegisterUses(); |
2738 | BuildDef(cast, candidates); |
2739 | return srcCount; |
2740 | } |
2741 | |
2742 | //----------------------------------------------------------------------------------------- |
2743 | // BuildIndir: Specify register requirements for address expression of an indirection operation. |
2744 | // |
2745 | // Arguments: |
2746 | // indirTree - GT_IND or GT_STOREIND gentree node |
2747 | // |
2748 | // Return Value: |
2749 | // The number of sources consumed by this node. |
2750 | // |
2751 | int LinearScan::BuildIndir(GenTreeIndir* indirTree) |
2752 | { |
2753 | // If this is the rhs of a block copy (i.e. non-enregisterable struct), |
2754 | // it has no register requirements. |
2755 | if (indirTree->TypeGet() == TYP_STRUCT) |
2756 | { |
2757 | return 0; |
2758 | } |
2759 | |
2760 | #ifdef FEATURE_SIMD |
2761 | RefPosition* internalFloatDef = nullptr; |
2762 | if (indirTree->TypeGet() == TYP_SIMD12) |
2763 | { |
2764 | // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir(). |
2765 | assert(!indirTree->Addr()->isContained()); |
2766 | |
2767 | // Vector3 is read/written as two reads/writes: 8 byte and 4 byte. |
2768 | // To assemble the vector properly we would need an additional |
2769 | // XMM register. |
2770 | internalFloatDef = buildInternalFloatRegisterDefForNode(indirTree); |
2771 | |
2772 | // In case of GT_IND we need an internal register different from targetReg and |
2773 | // both of the registers are used at the same time. |
2774 | if (indirTree->OperGet() == GT_IND) |
2775 | { |
2776 | setInternalRegsDelayFree = true; |
2777 | } |
2778 | } |
2779 | #endif // FEATURE_SIMD |
2780 | |
2781 | regMaskTP indirCandidates = RBM_NONE; |
2782 | int srcCount = BuildIndirUses(indirTree, indirCandidates); |
2783 | if (indirTree->gtOper == GT_STOREIND) |
2784 | { |
2785 | GenTree* source = indirTree->gtGetOp2(); |
2786 | if (indirTree->AsStoreInd()->IsRMWMemoryOp()) |
2787 | { |
2788 | // Because 'source' is contained, we haven't yet determined its special register requirements, if any. |
2789 | // As it happens, the Shift or Rotate cases are the only ones with special requirements. |
2790 | assert(source->isContained() && source->OperIsRMWMemOp()); |
2791 | GenTree* nonMemSource = nullptr; |
2792 | GenTreeIndir* otherIndir = nullptr; |
2793 | |
2794 | if (source->OperIsShiftOrRotate()) |
2795 | { |
2796 | srcCount += BuildShiftRotate(source); |
2797 | } |
2798 | else |
2799 | { |
2800 | regMaskTP srcCandidates = RBM_NONE; |
2801 | |
                if (indirTree->AsStoreInd()->IsRMWDstOp1())
                {
                    otherIndir = source->gtGetOp1()->AsIndir();
                    if (source->OperIsBinary())
                    {
                        nonMemSource = source->gtGetOp2();
                    }
                }
                else if (indirTree->AsStoreInd()->IsRMWDstOp2())
                {
                    otherIndir   = source->gtGetOp2()->AsIndir();
                    nonMemSource = source->gtGetOp1();
                }

#ifdef _TARGET_X86_
                // Determine if we need byte regs for the non-mem source, if any.
                // Note that BuildShiftRotate (above) will handle the byte requirement as needed,
                // but STOREIND isn't itself an RMW op, so we have to explicitly set it for that case.
                if ((nonMemSource != nullptr) && !nonMemSource->isContained() && varTypeIsByte(indirTree))
                {
                    srcCandidates = RBM_BYTE_REGS;
                }
#endif
2827 | if (otherIndir != nullptr) |
2828 | { |
2829 | // Any lclVars in the addressing mode of this indirection are contained. |
2830 | // If they are marked as lastUse, transfer the last use flag to the store indir. |
2831 | GenTree* base = otherIndir->Base(); |
2832 | GenTree* dstBase = indirTree->Base(); |
2833 | CheckAndMoveRMWLastUse(base, dstBase); |
2834 | GenTree* index = otherIndir->Index(); |
2835 | GenTree* dstIndex = indirTree->Index(); |
2836 | CheckAndMoveRMWLastUse(index, dstIndex); |
2837 | } |
2838 | srcCount += BuildBinaryUses(source->AsOp(), srcCandidates); |
2839 | } |
2840 | } |
2841 | else |
2842 | { |
2843 | #ifdef _TARGET_X86_ |
2844 | if (varTypeIsByte(indirTree) && !source->isContained()) |
2845 | { |
2846 | BuildUse(source, allByteRegs()); |
2847 | srcCount++; |
2848 | } |
2849 | else |
2850 | #endif |
2851 | { |
2852 | srcCount += BuildOperandUses(source); |
2853 | } |
2854 | } |
2855 | } |
2856 | #ifdef FEATURE_SIMD |
2857 | if (varTypeIsSIMD(indirTree)) |
2858 | { |
2859 | SetContainsAVXFlags(true, genTypeSize(indirTree->TypeGet())); |
2860 | } |
2861 | buildInternalRegisterUses(); |
2862 | #endif // FEATURE_SIMD |
2863 | |
2864 | if (indirTree->gtOper != GT_STOREIND) |
2865 | { |
2866 | BuildDef(indirTree); |
2867 | } |
2868 | return srcCount; |
2869 | } |
2870 | |
2871 | //------------------------------------------------------------------------ |
2872 | // BuildMul: Set the NodeInfo for a multiply. |
2873 | // |
2874 | // Arguments: |
2875 | // tree - The node of interest |
2876 | // |
2877 | // Return Value: |
2878 | // The number of sources consumed by this node. |
2879 | // |
2880 | int LinearScan::BuildMul(GenTree* tree) |
2881 | { |
2882 | assert(tree->OperIsMul()); |
2883 | GenTree* op1 = tree->gtGetOp1(); |
2884 | GenTree* op2 = tree->gtGetOp2(); |
2885 | |
2886 | // Only non-floating point mul has special requirements |
2887 | if (varTypeIsFloating(tree->TypeGet())) |
2888 | { |
2889 | return BuildSimple(tree); |
2890 | } |
2891 | |
2892 | int srcCount = BuildBinaryUses(tree->AsOp()); |
2893 | int dstCount = 1; |
2894 | regMaskTP dstCandidates = RBM_NONE; |
2895 | |
2896 | bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); |
2897 | bool requiresOverflowCheck = tree->gtOverflowEx(); |
2898 | |
2899 | // There are three forms of x86 multiply: |
2900 | // one-op form: RDX:RAX = RAX * r/m |
2901 | // two-op form: reg *= r/m |
2902 | // three-op form: reg = r/m * imm |
2903 | |
2904 | // This special widening 32x32->64 MUL is not used on x64 |
2905 | CLANG_FORMAT_COMMENT_ANCHOR; |
2906 | #if defined(_TARGET_X86_) |
2907 | if (tree->OperGet() != GT_MUL_LONG) |
2908 | #endif |
2909 | { |
2910 | assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); |
2911 | } |
2912 | |
2913 | // We do use the widening multiply to implement |
2914 | // the overflow checking for unsigned multiply |
2915 | // |
2916 | if (isUnsignedMultiply && requiresOverflowCheck) |
2917 | { |
2918 | // The only encoding provided is RDX:RAX = RAX * rm |
2919 | // |
2920 | // Here we set RAX as the only destination candidate |
2921 | // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX |
2922 | // |
2923 | dstCandidates = RBM_RAX; |
2924 | } |
2925 | else if (tree->OperGet() == GT_MULHI) |
2926 | { |
2927 | // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the |
2928 | // upper 32 bits of the result set the destination candidate to REG_RDX. |
2929 | dstCandidates = RBM_RDX; |
2930 | } |
2931 | #if defined(_TARGET_X86_) |
2932 | else if (tree->OperGet() == GT_MUL_LONG) |
2933 | { |
2934 | // have to use the encoding:RDX:RAX = RAX * rm |
2935 | dstCandidates = RBM_RAX | RBM_RDX; |
2936 | dstCount = 2; |
2937 | } |
2938 | #endif |
2939 | GenTree* containedMemOp = nullptr; |
2940 | if (op1->isContained() && !op1->IsCnsIntOrI()) |
2941 | { |
2942 | assert(!op2->isContained() || op2->IsCnsIntOrI()); |
2943 | containedMemOp = op1; |
2944 | } |
2945 | else if (op2->isContained() && !op2->IsCnsIntOrI()) |
2946 | { |
2947 | containedMemOp = op2; |
2948 | } |
2949 | regMaskTP killMask = getKillSetForMul(tree->AsOp()); |
2950 | BuildDefsWithKills(tree, dstCount, dstCandidates, killMask); |
2951 | return srcCount; |
2952 | } |
2953 | |
2954 | //------------------------------------------------------------------------------ |
2955 | // SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set |
2956 | // Contains256bitAVX flag when SIMD vector size is 32 bytes |
2957 | // |
2958 | // Arguments: |
2959 | // isFloatingPointType - true if it is floating point type |
2960 | // sizeOfSIMDVector - SIMD Vector size |
2961 | // |
2962 | void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) |
2963 | { |
2964 | if (isFloatingPointType && compiler->canUseVexEncoding()) |
2965 | { |
2966 | compiler->getEmitter()->SetContainsAVX(true); |
2967 | if (sizeOfSIMDVector == 32) |
2968 | { |
2969 | compiler->getEmitter()->SetContains256bitAVX(true); |
2970 | } |
2971 | } |
2972 | } |
2973 | |
2974 | #endif // _TARGET_XARCH_ |
2975 | |