1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
5/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7XX XX
8XX Amd64 SIMD Code Generator XX
9XX XX
10XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12*/
13#include "jitpch.h"
14#ifdef _MSC_VER
15#pragma hdrstop
16#endif
17
18#ifdef _TARGET_XARCH_
19#ifdef FEATURE_SIMD
20
21#include "emit.h"
22#include "codegen.h"
23#include "sideeffects.h"
24#include "lower.h"
25#include "gcinfo.h"
26#include "gcinfoencoder.h"
27
28// Instruction immediates
29
30// Insertps:
31// - bits 6 and 7 of the immediate indicate which source item to select (0..3)
32// - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
33// - bits 0 to 3 of the immediate indicate which target item to zero
34#define INSERTPS_SOURCE_SELECT(i) (i << 6)
35#define INSERTPS_TARGET_SELECT(i) (i << 4)
36#define INSERTPS_ZERO(i) (1 << i)
37
38// getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
39//
40// Arguments:
41// intrinsicId - SIMD intrinsic Id
42// baseType - Base type of the SIMD vector
43// immed - Out param. Any immediate byte operand that needs to be passed to SSE2 opcode
44//
45//
46// Return Value:
47// Instruction (op) to be used, and immed is set if instruction requires an immediate operand.
48//
49instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
50{
51 // Minimal required instruction set is SSE2.
52 assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
53
54 instruction result = INS_invalid;
55 switch (intrinsicId)
56 {
57 case SIMDIntrinsicInit:
58 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
59 {
60 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
61 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
62 // If we decide to use AVX2 only, we can remove this assert.
63 if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2))
64 {
65 assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
66 }
67 switch (baseType)
68 {
69 case TYP_FLOAT:
70 result = INS_vbroadcastss;
71 break;
72 case TYP_DOUBLE:
73 result = INS_vbroadcastsd;
74 break;
75 case TYP_ULONG:
76 case TYP_LONG:
77 // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed
78 // to be TYP_LONG reg.
79 result = INS_vpbroadcastq;
80 break;
81 case TYP_UINT:
82 case TYP_INT:
83 result = INS_vpbroadcastd;
84 break;
85 case TYP_USHORT:
86 case TYP_SHORT:
87 result = INS_vpbroadcastw;
88 break;
89 case TYP_UBYTE:
90 case TYP_BYTE:
91 result = INS_vpbroadcastb;
92 break;
93 default:
94 unreached();
95 }
96 break;
97 }
98
99 // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
100 __fallthrough;
101
102 case SIMDIntrinsicShuffleSSE2:
103 if (baseType == TYP_FLOAT)
104 {
105 result = INS_shufps;
106 }
107 else if (baseType == TYP_DOUBLE)
108 {
109 result = INS_shufpd;
110 }
111 else if (baseType == TYP_INT || baseType == TYP_UINT)
112 {
113 result = INS_pshufd;
114 }
115 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
116 {
117 // We don't have a separate SSE2 instruction and will
118 // use the instruction meant for doubles since it is
119 // of the same size as a long.
120 result = INS_shufpd;
121 }
122 break;
123
124 case SIMDIntrinsicSqrt:
125 if (baseType == TYP_FLOAT)
126 {
127 result = INS_sqrtps;
128 }
129 else if (baseType == TYP_DOUBLE)
130 {
131 result = INS_sqrtpd;
132 }
133 else
134 {
135 unreached();
136 }
137 break;
138
139 case SIMDIntrinsicAdd:
140 if (baseType == TYP_FLOAT)
141 {
142 result = INS_addps;
143 }
144 else if (baseType == TYP_DOUBLE)
145 {
146 result = INS_addpd;
147 }
148 else if (baseType == TYP_INT || baseType == TYP_UINT)
149 {
150 result = INS_paddd;
151 }
152 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
153 {
154 result = INS_paddw;
155 }
156 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
157 {
158 result = INS_paddb;
159 }
160 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
161 {
162 result = INS_paddq;
163 }
164 break;
165
166 case SIMDIntrinsicSub:
167 if (baseType == TYP_FLOAT)
168 {
169 result = INS_subps;
170 }
171 else if (baseType == TYP_DOUBLE)
172 {
173 result = INS_subpd;
174 }
175 else if (baseType == TYP_INT || baseType == TYP_UINT)
176 {
177 result = INS_psubd;
178 }
179 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
180 {
181 result = INS_psubw;
182 }
183 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
184 {
185 result = INS_psubb;
186 }
187 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
188 {
189 result = INS_psubq;
190 }
191 break;
192
193 case SIMDIntrinsicMul:
194 if (baseType == TYP_FLOAT)
195 {
196 result = INS_mulps;
197 }
198 else if (baseType == TYP_DOUBLE)
199 {
200 result = INS_mulpd;
201 }
202 else if (baseType == TYP_SHORT)
203 {
204 result = INS_pmullw;
205 }
206 else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
207 {
208 result = INS_pmulld;
209 }
210 break;
211
212 case SIMDIntrinsicDiv:
213 if (baseType == TYP_FLOAT)
214 {
215 result = INS_divps;
216 }
217 else if (baseType == TYP_DOUBLE)
218 {
219 result = INS_divpd;
220 }
221 else
222 {
223 unreached();
224 }
225 break;
226
227 case SIMDIntrinsicMin:
228 if (baseType == TYP_FLOAT)
229 {
230 result = INS_minps;
231 }
232 else if (baseType == TYP_DOUBLE)
233 {
234 result = INS_minpd;
235 }
236 else if (baseType == TYP_UBYTE)
237 {
238 result = INS_pminub;
239 }
240 else if (baseType == TYP_SHORT)
241 {
242 result = INS_pminsw;
243 }
244 else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
245 {
246 if (baseType == TYP_BYTE)
247 {
248 result = INS_pminsb;
249 }
250 else if (baseType == TYP_USHORT)
251 {
252 result = INS_pminuw;
253 }
254 else if (baseType == TYP_INT)
255 {
256 result = INS_pminsd;
257 }
258 else if (baseType == TYP_UINT)
259 {
260 result = INS_pminud;
261 }
262 }
263 else
264 {
265 unreached();
266 }
267 break;
268
269 case SIMDIntrinsicMax:
270 if (baseType == TYP_FLOAT)
271 {
272 result = INS_maxps;
273 }
274 else if (baseType == TYP_DOUBLE)
275 {
276 result = INS_maxpd;
277 }
278 else if (baseType == TYP_UBYTE)
279 {
280 result = INS_pmaxub;
281 }
282 else if (baseType == TYP_SHORT)
283 {
284 result = INS_pmaxsw;
285 }
286 else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
287 {
288 if (baseType == TYP_BYTE)
289 {
290 result = INS_pmaxsb;
291 }
292 else if (baseType == TYP_USHORT)
293 {
294 result = INS_pmaxuw;
295 }
296 else if (baseType == TYP_INT)
297 {
298 result = INS_pmaxsd;
299 }
300 else if (baseType == TYP_UINT)
301 {
302 result = INS_pmaxud;
303 }
304 }
305 else
306 {
307 unreached();
308 }
309 break;
310
311 case SIMDIntrinsicAbs:
312 if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
313 {
314 if (baseType == TYP_INT)
315 {
316 result = INS_pabsd;
317 }
318 else if (baseType == TYP_SHORT)
319 {
320 result = INS_pabsw;
321 }
322 else if (baseType == TYP_BYTE)
323 {
324 result = INS_pabsb;
325 }
326 }
327 break;
328
329 case SIMDIntrinsicEqual:
330 if (baseType == TYP_FLOAT)
331 {
332 result = INS_cmpps;
333 assert(ival != nullptr);
334 *ival = 0;
335 }
336 else if (baseType == TYP_DOUBLE)
337 {
338 result = INS_cmppd;
339 assert(ival != nullptr);
340 *ival = 0;
341 }
342 else if (baseType == TYP_INT || baseType == TYP_UINT)
343 {
344 result = INS_pcmpeqd;
345 }
346 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
347 {
348 result = INS_pcmpeqw;
349 }
350 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
351 {
352 result = INS_pcmpeqb;
353 }
354 else if ((baseType == TYP_ULONG || baseType == TYP_LONG) &&
355 (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
356 {
357 result = INS_pcmpeqq;
358 }
359 break;
360
361 case SIMDIntrinsicLessThan:
362 // Packed integers use > with swapped operands
363 assert(baseType != TYP_INT);
364
365 if (baseType == TYP_FLOAT)
366 {
367 result = INS_cmpps;
368 assert(ival != nullptr);
369 *ival = 1;
370 }
371 else if (baseType == TYP_DOUBLE)
372 {
373 result = INS_cmppd;
374 assert(ival != nullptr);
375 *ival = 1;
376 }
377 break;
378
379 case SIMDIntrinsicLessThanOrEqual:
380 // Packed integers use (a==b) || ( b > a) in place of a <= b.
381 assert(baseType != TYP_INT);
382
383 if (baseType == TYP_FLOAT)
384 {
385 result = INS_cmpps;
386 assert(ival != nullptr);
387 *ival = 2;
388 }
389 else if (baseType == TYP_DOUBLE)
390 {
391 result = INS_cmppd;
392 assert(ival != nullptr);
393 *ival = 2;
394 }
395 break;
396
397 case SIMDIntrinsicGreaterThan:
398 // Packed float/double use < with swapped operands
399 assert(!varTypeIsFloating(baseType));
400
401 // SSE2 supports only signed >
402 if (baseType == TYP_INT)
403 {
404 result = INS_pcmpgtd;
405 }
406 else if (baseType == TYP_SHORT)
407 {
408 result = INS_pcmpgtw;
409 }
410 else if (baseType == TYP_BYTE)
411 {
412 result = INS_pcmpgtb;
413 }
414 else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
415 {
416 result = INS_pcmpgtq;
417 }
418 break;
419
420 case SIMDIntrinsicBitwiseAnd:
421 if (baseType == TYP_FLOAT)
422 {
423 result = INS_andps;
424 }
425 else if (baseType == TYP_DOUBLE)
426 {
427 result = INS_andpd;
428 }
429 else if (varTypeIsIntegral(baseType))
430 {
431 result = INS_pand;
432 }
433 break;
434
435 case SIMDIntrinsicBitwiseAndNot:
436 if (baseType == TYP_FLOAT)
437 {
438 result = INS_andnps;
439 }
440 else if (baseType == TYP_DOUBLE)
441 {
442 result = INS_andnpd;
443 }
444 else if (baseType == TYP_INT)
445 {
446 result = INS_pandn;
447 }
448 else if (varTypeIsIntegral(baseType))
449 {
450 result = INS_pandn;
451 }
452 break;
453
454 case SIMDIntrinsicBitwiseOr:
455 if (baseType == TYP_FLOAT)
456 {
457 result = INS_orps;
458 }
459 else if (baseType == TYP_DOUBLE)
460 {
461 result = INS_orpd;
462 }
463 else if (varTypeIsIntegral(baseType))
464 {
465 result = INS_por;
466 }
467 break;
468
469 case SIMDIntrinsicBitwiseXor:
470 if (baseType == TYP_FLOAT)
471 {
472 result = INS_xorps;
473 }
474 else if (baseType == TYP_DOUBLE)
475 {
476 result = INS_xorpd;
477 }
478 else if (varTypeIsIntegral(baseType))
479 {
480 result = INS_pxor;
481 }
482 break;
483
484 case SIMDIntrinsicCast:
485 result = INS_movaps;
486 break;
487
488 case SIMDIntrinsicConvertToSingle:
489 result = INS_cvtdq2ps;
490 break;
491
492 case SIMDIntrinsicConvertToDouble:
493 assert(baseType == TYP_LONG);
494 result = INS_cvtsi2sd;
495 break;
496
497 case SIMDIntrinsicConvertToInt32:
498 assert(baseType == TYP_FLOAT);
499 result = INS_cvttps2dq;
500 break;
501
502 case SIMDIntrinsicConvertToInt64:
503 assert(baseType == TYP_DOUBLE);
504 result = INS_cvttsd2si;
505 break;
506
507 case SIMDIntrinsicNarrow:
508 // Note that for the integer types the caller must zero the upper bits of
509 // each source element, since the instructions saturate.
510 switch (baseType)
511 {
512 case TYP_INT:
513 case TYP_UINT:
514 if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
515 {
516 result = INS_packusdw;
517 }
518 else
519 {
520 result = INS_packssdw;
521 }
522 break;
523 case TYP_SHORT:
524 case TYP_USHORT:
525 result = INS_packuswb;
526 break;
527 default:
528 assert(!"Invalid baseType for SIMDIntrinsicNarrow");
529 result = INS_invalid;
530 break;
531 }
532 break;
533
534 case SIMDIntrinsicWidenLo:
535 // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
536 // and another to widen the hi half.
537 switch (baseType)
538 {
539 case TYP_FLOAT:
540 result = INS_cvtps2pd;
541 break;
542 case TYP_INT:
543 case TYP_UINT:
544 result = INS_punpckldq;
545 break;
546 case TYP_SHORT:
547 case TYP_USHORT:
548 result = INS_punpcklwd;
549 break;
550 case TYP_BYTE:
551 case TYP_UBYTE:
552 result = INS_punpcklbw;
553 break;
554 default:
555 assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
556 result = INS_invalid;
557 break;
558 }
559 break;
560
561 case SIMDIntrinsicWidenHi:
562 switch (baseType)
563 {
564 case TYP_FLOAT:
565 // For this case, we actually use the same instruction.
566 result = INS_cvtps2pd;
567 break;
568 case TYP_INT:
569 case TYP_UINT:
570 result = INS_punpckhdq;
571 break;
572 case TYP_SHORT:
573 case TYP_USHORT:
574 result = INS_punpckhwd;
575 break;
576 case TYP_BYTE:
577 case TYP_UBYTE:
578 result = INS_punpckhbw;
579 break;
580 default:
581 assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
582 result = INS_invalid;
583 break;
584 }
585 break;
586
587 case SIMDIntrinsicShiftLeftInternal:
588 switch (baseType)
589 {
590 case TYP_SIMD16:
591 // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
592 result = INS_pslldq;
593 break;
594 case TYP_UINT:
595 case TYP_INT:
596 result = INS_pslld;
597 break;
598 case TYP_SHORT:
599 case TYP_USHORT:
600 result = INS_psllw;
601 break;
602 default:
603 assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
604 result = INS_invalid;
605 break;
606 }
607 break;
608
609 case SIMDIntrinsicShiftRightInternal:
610 switch (baseType)
611 {
612 case TYP_SIMD16:
613 // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
614 result = INS_psrldq;
615 break;
616 case TYP_UINT:
617 case TYP_INT:
618 result = INS_psrld;
619 break;
620 case TYP_SHORT:
621 case TYP_USHORT:
622 result = INS_psrlw;
623 break;
624 default:
625 assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
626 result = INS_invalid;
627 break;
628 }
629 break;
630
631 case SIMDIntrinsicUpperSave:
632 result = INS_vextractf128;
633 break;
634
635 case SIMDIntrinsicUpperRestore:
636 result = INS_insertps;
637 break;
638
639 default:
640 assert(!"Unsupported SIMD intrinsic");
641 unreached();
642 }
643
644 noway_assert(result != INS_invalid);
645 return result;
646}
647
648// genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
649// to target mm reg, zeroing out the upper bits if and only if specified.
650//
651// Arguments:
652// targetType the target type
653// baseType the base type of value to be moved
654// targetReg the target reg
655// srcReg the src reg
656// moveType action to be performed on target upper bits
657//
658// Return Value:
659// None
660//
661// Notes:
662// This is currently only supported for floating point types.
663//
664void CodeGen::genSIMDScalarMove(
665 var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
666{
667 assert(varTypeIsFloating(baseType));
668 switch (moveType)
669 {
670 case SMT_PreserveUpper:
671 if (srcReg != targetReg)
672 {
673 instruction ins = ins_Store(baseType);
674 if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
675 {
676 // In general, when we use a three-operands move instruction, we want to merge the src with
677 // itself. This is an exception in that we actually want the "merge" behavior, so we must
678 // specify it with all 3 operands.
679 inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
680 }
681 else
682 {
683 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
684 }
685 }
686 break;
687
688 case SMT_ZeroInitUpper:
689 if (compiler->canUseVexEncoding())
690 {
691 // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
692 // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
693 // to zero all but the lower bits.
694 unsigned int insertpsImm =
695 (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
696 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
697 }
698 else
699 {
700 if (srcReg == targetReg)
701 {
702 // There is no guarantee that upper bits of op1Reg are zero.
703 // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
704 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
705 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
706 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
707 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
708 }
709 else
710 {
711 genSIMDZero(targetType, TYP_FLOAT, targetReg);
712 inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
713 }
714 }
715 break;
716
717 case SMT_ZeroInitUpper_SrcHasUpperZeros:
718 if (srcReg != targetReg)
719 {
720 instruction ins = ins_Copy(baseType);
721 assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
722 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
723 }
724 break;
725
726 default:
727 unreached();
728 }
729}
730
731void CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg)
732{
733 // We just use `INS_xorps` instead of `getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType)`
734 // since `genSIMDZero` is used for both `System.Numerics.Vectors` and HardwareIntrinsics. Modern
735 // CPUs handle this specially in the renamer and it never hits the execution pipeline, additionally
736 // `INS_xorps` is always available (when using either the legacy or VEX encoding).
737 inst_RV_RV(INS_xorps, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
738}
739
740//------------------------------------------------------------------------
741// genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
742//
743// Arguments:
744// simdNode - The GT_SIMD node
745//
746// Return Value:
747// None.
748//
749void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
750{
751 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);
752
753 GenTree* op1 = simdNode->gtGetOp1();
754 var_types baseType = simdNode->gtSIMDBaseType;
755 regNumber targetReg = simdNode->gtRegNum;
756 assert(targetReg != REG_NA);
757 var_types targetType = simdNode->TypeGet();
758 SIMDLevel level = compiler->getSIMDSupportLevel();
759 unsigned size = simdNode->gtSIMDSize;
760
761 // Should never see small int base type vectors except for zero initialization.
762 noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
763
764 instruction ins = INS_invalid;
765
766#if !defined(_TARGET_64BIT_)
767 if (op1->OperGet() == GT_LONG)
768 {
769 assert(varTypeIsLong(baseType));
770
771 GenTree* op1lo = op1->gtGetOp1();
772 GenTree* op1hi = op1->gtGetOp2();
773
774 if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
775 {
776 genSIMDZero(targetType, baseType, targetReg);
777 }
778 else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
779 {
780 // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
781 ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
782 inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
783 }
784 else
785 {
786 // Generate:
787 // mov_i2xmm targetReg, op1lo
788 // mov_i2xmm xmmtmp, op1hi
789 // shl xmmtmp, 4 bytes
790 // por targetReg, xmmtmp
791 // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
792 // shufpd targetReg, targetReg, 0 // move the long to all the lanes
793 // For AVX2, move it to all 4 of the 64-bit lanes using:
794 // vpbroadcastq targetReg, targetReg
795
796 instruction ins;
797
798 regNumber op1loReg = genConsumeReg(op1lo);
799 ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
800 inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));
801
802 regNumber tmpReg = simdNode->GetSingleTempReg();
803
804 regNumber op1hiReg = genConsumeReg(op1hi);
805 ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
806 inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
807
808 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
809 getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
810
811 ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
812 inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
813
814 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
815 {
816 inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
817 }
818 else
819 {
820 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
821 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
822 }
823 }
824 }
825 else
826#endif // !defined(_TARGET_64BIT_)
827 if (op1->isContained())
828 {
829 if (op1->IsIntegralConst(0) || op1->IsFPZero())
830 {
831 genSIMDZero(targetType, baseType, targetReg);
832 }
833 else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1))
834 {
835 // case of initializing elements of vector with all 1's
836 // generate pcmpeqd reg, reg
837 ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
838 inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
839 }
840 else
841 {
842 assert(level == SIMD_AVX2_Supported);
843 ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
844 if (op1->IsCnsFltOrDbl())
845 {
846 getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
847 }
848 else if (op1->OperIsLocalAddr())
849 {
850 unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
851 getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum,
852 offset);
853 }
854 else
855 {
856 unreached();
857 }
858 }
859 }
860 else if (level == SIMD_AVX2_Supported && ((size == 32) || (size == 16)))
861 {
862 regNumber srcReg = genConsumeReg(op1);
863 if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
864 {
865 ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
866 assert(ins != INS_invalid);
867 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
868 srcReg = targetReg;
869 }
870
871 ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
872 getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
873 }
874 else
875 {
876 // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
877 // In either case we are going to use the SSE2 shuffle instruction.
878
879 regNumber op1Reg = genConsumeReg(op1);
880 unsigned shuffleControl = 0;
881
882 if (compiler->isSubRegisterSIMDType(simdNode))
883 {
884 assert(baseType == TYP_FLOAT);
885
886 // We cannot assume that upper bits of op1Reg or targetReg be zero.
887 // Therefore we need to explicitly zero out upper bits. This is
888 // essential for the shuffle operation performed below.
889 //
890 // If op1 is a float/double constant, we would have loaded it from
891 // data section using movss/sd. Similarly if op1 is a memory op we
892 // would have loaded it using movss/sd. Movss/sd when loading a xmm reg
893 // from memory would zero-out upper bits. In these cases we can
894 // avoid explicitly zero'ing out targetReg if targetReg and op1Reg are the same or do it more efficiently
895 // if they are not the same.
896 SIMDScalarMoveType moveType =
897 op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper;
898
899 genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType);
900
901 if (size == 8)
902 {
903 shuffleControl = 0x50;
904 }
905 else if (size == 12)
906 {
907 shuffleControl = 0x40;
908 }
909 else
910 {
911 noway_assert(!"Unexpected size for SIMD type");
912 }
913 }
914 else // Vector<T>
915 {
916 if (op1Reg != targetReg)
917 {
918 if (varTypeIsFloating(baseType))
919 {
920 ins = ins_Copy(targetType);
921 }
922 else if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
923 {
924 ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
925 }
926
927 assert(ins != INS_invalid);
928 inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
929 }
930 }
931
932 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
933 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
934 }
935
936 genProduceReg(simdNode);
937}
938
939//-------------------------------------------------------------------------------------------
940// genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
941// a number of arguments equal to the length of the Vector.
942//
943// Arguments:
944// simdNode - The GT_SIMD node
945//
946// Return Value:
947// None.
948//
949void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
950{
951 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);
952
953 // Right now this intrinsic is supported only on TYP_FLOAT vectors
954 var_types baseType = simdNode->gtSIMDBaseType;
955 noway_assert(baseType == TYP_FLOAT);
956
957 regNumber targetReg = simdNode->gtRegNum;
958 assert(targetReg != REG_NA);
959
960 var_types targetType = simdNode->TypeGet();
961
962 // Note that we cannot use targetReg before consumed all source operands. Therefore,
963 // Need an internal register to stitch together all the values into a single vector
964 // in an XMM reg.
965 regNumber vectorReg = simdNode->GetSingleTempReg();
966
967 // Zero out vectorReg if we are constructing a vector whose size is not equal to targetType vector size.
968 // For example in case of Vector4f we don't need to zero when using SSE2.
969 if (compiler->isSubRegisterSIMDType(simdNode))
970 {
971 genSIMDZero(targetType, baseType, vectorReg);
972 }
973
974 unsigned int baseTypeSize = genTypeSize(baseType);
975 instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
976
977 // We will first consume the list items in execution (left to right) order,
978 // and record the registers.
979 regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT];
980 unsigned initCount = 0;
981 for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
982 {
983 assert(list->OperGet() == GT_LIST);
984 GenTree* listItem = list->gtGetOp1();
985 assert(listItem->TypeGet() == baseType);
986 assert(!listItem->isContained());
987 regNumber operandReg = genConsumeReg(listItem);
988 operandRegs[initCount] = operandReg;
989 initCount++;
990 }
991
992 unsigned int offset = 0;
993 for (unsigned i = 0; i < initCount; i++)
994 {
995 // We will now construct the vector from the list items in reverse order.
996 // This allows us to efficiently stitch together a vector as follows:
997 // vectorReg = (vectorReg << offset)
998 // VectorReg[0] = listItemReg
999 // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
1000 // bits of vectorReg are not modified.
1001
1002 regNumber operandReg = operandRegs[initCount - i - 1];
1003 if (offset != 0)
1004 {
1005 getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
1006 }
1007 genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper);
1008
1009 offset += baseTypeSize;
1010 }
1011
1012 noway_assert(offset == simdNode->gtSIMDSize);
1013
1014 // Load the initialized value.
1015 if (targetReg != vectorReg)
1016 {
1017 inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
1018 }
1019 genProduceReg(simdNode);
1020}
1021
1022//----------------------------------------------------------------------------------
1023// genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
1024//
1025// Arguments:
1026// simdNode - The GT_SIMD node
1027//
1028// Return Value:
1029// None.
1030//
1031void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
1032{
1033 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast ||
1034 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAbs);
1035
1036 GenTree* op1 = simdNode->gtGetOp1();
1037 var_types baseType = simdNode->gtSIMDBaseType;
1038 regNumber targetReg = simdNode->gtRegNum;
1039 assert(targetReg != REG_NA);
1040 var_types targetType = simdNode->TypeGet();
1041
1042 regNumber op1Reg = genConsumeReg(op1);
1043 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1044 if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
1045 {
1046 inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1047 }
1048 genProduceReg(simdNode);
1049}
1050
1051//----------------------------------------------------------------------------------
1052// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
1053//
1054// Arguments:
1055// simdNode - The GT_SIMD node
1056//
1057// Return Value:
1058// None.
1059//
1060void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
1061{
1062 SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
1063 assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32));
1064
1065 GenTree* op1 = simdNode->gtGetOp1();
1066 var_types baseType = simdNode->gtSIMDBaseType;
1067 regNumber targetReg = simdNode->gtRegNum;
1068 assert(targetReg != REG_NA);
1069 var_types targetType = simdNode->TypeGet();
1070
1071 regNumber op1Reg = genConsumeReg(op1);
1072 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1073 if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
1074 {
1075 regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
1076 regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1077 regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1078 assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
1079
1080 // We will generate the following:
1081 // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
1082 // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
1083 // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg)
1084 // vpslld tmpReg2, 16
1085 // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2)
1086 // mov tmpIntReg, 0x5300000053000000
1087 // vmovd tmpReg, tmpIntReg
1088 // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src)
1089 // vorps targetReg, tmpReg
1090 // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg)
1091 // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2)
1092 // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits)
1093 inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
1094 if (targetReg != op1Reg)
1095 {
1096 inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
1097 }
1098
1099 // prepare upper 16 bits
1100 getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);
1101
1102 // prepare lower 16 bits
1103 getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
1104 getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);
1105
1106// prepare mask
1107#ifdef _TARGET_AMD64_
1108 getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
1109 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1110#else
1111 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1112 {
1113 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
1114 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1115 }
1116 else
1117 {
1118 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
1119 inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1120 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
1121 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
1122 }
1123#endif
1124 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1125 {
1126 inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1127 }
1128 else
1129 {
1130 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1131 }
1132
1133 // convert upper 16 bits
1134 inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
1135 inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
1136
1137 // convert lower 16 bits
1138 inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));
1139
1140 // add lower 16 bits and upper 16 bits
1141 inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
1142 }
1143 else
1144 {
1145 inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1146 }
1147 genProduceReg(simdNode);
1148}
1149
1150//----------------------------------------------------------------------------------
1151// genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double)
1152//
1153// Arguments:
1154// intrinsicID the SIMD intrinsic ID
1155// simdType the SIMD node type
1156// baseType the base type of value to be converted
1157// tmpReg the tmp reg
1158// tmpIntReg the tmp integer reg
1159// targetReg the target reg
1160//
1161// Return Value:
1162// None.
1163//
1164void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
1165 var_types simdType,
1166 var_types baseType,
1167 regNumber tmpReg,
1168 regNumber tmpIntReg,
1169 regNumber targetReg)
1170{
1171 instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
1172 if (intrinsicID == SIMDIntrinsicConvertToDouble)
1173 {
1174 // Note that for mov_xmm2i, the int register is always in the reg2 position
1175 inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
1176 inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
1177 }
1178 else
1179 {
1180 inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
1181 inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
1182 }
1183}
1184
1185//----------------------------------------------------------------------------------
1186// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
1187//
1188// Arguments:
1189// simdNode - The GT_SIMD node
1190//
1191// Notes:
1192// There are no instructions for converting to/from 64-bit integers, so for these we
1193// do the conversion an element at a time.
1194//
1195void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
1196{
1197 SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
1198 assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64));
1199
1200 GenTree* op1 = simdNode->gtGetOp1();
1201 var_types baseType = simdNode->gtSIMDBaseType;
1202 regNumber targetReg = simdNode->gtRegNum;
1203 assert(targetReg != REG_NA);
1204 var_types simdType = simdNode->TypeGet();
1205 regNumber op1Reg = genConsumeReg(op1);
1206 regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
1207 regNumber tmpReg;
1208 regNumber tmpReg2;
1209 regNumber tmpReg3;
1210 SIMDLevel level = compiler->getSIMDSupportLevel();
1211
1212#ifdef _TARGET_X86_
1213 if (baseType == TYP_LONG)
1214 {
1215 tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1216 tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1217 tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1218 assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
1219 }
1220 else
1221#endif
1222 if (level == SIMD_AVX2_Supported || (baseType == TYP_ULONG))
1223 {
1224 tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1225 tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1226 tmpReg3 = REG_NA;
1227 assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
1228 }
1229 else
1230 {
1231 tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1232 assert(tmpReg != op1Reg);
1233 tmpReg2 = REG_NA;
1234 tmpReg3 = REG_NA;
1235 }
1236
1237 if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
1238 {
1239 // We will generate the following
1240 // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
1241 // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
1242 // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg)
1243 // vpsllq tmpReg2, 32
1244 // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2)
1245 // mov tmpIntReg, 0x4530000000000000
1246 // vmovd tmpReg, tmpIntReg
1247 // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src)
1248 // vorpd targetReg, tmpReg
1249 // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg)
1250 // mov tmpIntReg, 0x4330000000000000
1251 // vmovd tmpReg, tmpIntReg
1252 // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src)
1253 // vorpd tmpReg2, tmpReg
1254 // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2)
1255 // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together)
1256 inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
1257 if (targetReg != op1Reg)
1258 {
1259 inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
1260 }
1261
1262 // prepare upper 32 bits
1263 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
1264
1265 // prepare lower 32 bits
1266 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
1267 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
1268
1269// prepare mask for converting upper 32 bits
1270#ifdef _TARGET_AMD64_
1271 getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
1272 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1273#else
1274 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
1275 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1276 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1277#endif
1278 if (level == SIMD_AVX2_Supported)
1279 {
1280 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1281 }
1282 else
1283 {
1284 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1285 }
1286
1287 // convert upper 32 bits
1288 inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1289 inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1290
1291// prepare mask for converting lower 32 bits
1292#ifdef _TARGET_AMD64_
1293 getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
1294 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1295#else
1296 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
1297 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1298 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1299#endif
1300 if (level == SIMD_AVX2_Supported)
1301 {
1302 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1303 }
1304 else
1305 {
1306 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1307 }
1308
1309 // convert lower 32 bits
1310 inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1311 inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1312
1313 // add lower 32 bits and upper 32 bits
1314 inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
1315 }
1316 else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
1317 {
1318#ifdef _TARGET_AMD64_
1319 instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1320 instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1321
1322 if (level == SIMD_AVX2_Supported)
1323 {
1324 // Extract the high 16-bits
1325 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1326
1327 // Put v[3] (the high-order element) in tmpReg2 and convert it.
1328 inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1329 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1330 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
1331
1332 // Shift the resulting 64-bits left.
1333 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1334
1335 // Convert v[2], in the lo bits of tmpReg.
1336 // For the convert to double, the convert preserves the upper bits in tmpReg2.
1337 // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
1338 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
1339 }
1340
1341 // Put v[1] in tmpReg.
1342 inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
1343 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1344
1345 // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
1346 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1347
1348 // Shift the resulting 64-bits left.
1349 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1350
1351 // Convert the lo 64-bits into targetReg
1352 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);
1353
1354 // Merge or copy the results (only at this point are we done with op1Reg).
1355 if (tmpReg != targetReg)
1356 {
1357 inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1358 }
1359
1360 if (level == SIMD_AVX2_Supported)
1361 {
1362 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
1363 }
1364#else
1365 // get the sign bit and put it in tmpReg3
1366 inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
1367 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
1368 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);
1369
1370 // get the absolute value of src and put it into tmpReg2 and targetReg
1371 inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
1372 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
1373 getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
1374 inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
1375 inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
1376 inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType));
1377
1378 // prepare upper 32 bits
1379 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
1380
1381 // prepare lower 32 bits
1382 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
1383 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
1384
1385 // prepare mask for converting upper 32 bits
1386 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
1387 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1388 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1389
1390 if (level == SIMD_AVX2_Supported)
1391 {
1392 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1393 }
1394 else
1395 {
1396 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1397 }
1398
1399 // convert upper 32 bits
1400 inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1401 inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1402
1403 // prepare mask for converting lower 32 bits
1404 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
1405 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1406 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1407
1408 if (level == SIMD_AVX2_Supported)
1409 {
1410 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1411 }
1412 else
1413 {
1414 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1415 }
1416
1417 // convert lower 32 bits
1418 inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1419 inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1420
1421 // add lower 32 bits and upper 32 bits
1422 inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
1423
1424 // add sign bit
1425 inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType));
1426#endif
1427 }
1428 else
1429 {
1430 instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1431 instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1432
1433 if (level == SIMD_AVX2_Supported)
1434 {
1435 // Extract the high 16-bits
1436 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1437
1438 // Put v[3] (the high-order element) in tmpReg2 and convert it.
1439 inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1440 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1441 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
1442
1443 // Shift the resulting 64-bits left.
1444 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1445
1446 // Convert v[2], in the lo bits of tmpReg.
1447 // For the convert to double, the convert preserves the upper bits in tmpReg2.
1448 // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
1449 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1450 inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1451 }
1452
1453 // Put v[1] in tmpReg.
1454 inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
1455 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1456
1457 // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
1458 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1459
1460 // Shift the resulting 64-bits left.
1461 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1462
1463 // Convert the lo 64-bits into targetReg
1464 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg);
1465
1466 // Merge or copy the results (only at this point are we done with op1Reg).
1467 assert(tmpReg != targetReg);
1468 inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1469 if (level == SIMD_AVX2_Supported)
1470 {
1471 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01);
1472 }
1473 }
1474 genProduceReg(simdNode);
1475}
1476
1477//--------------------------------------------------------------------------------
1478// genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register
1479//
1480// Arguments:
1481// simdNode - The GT_SIMD node
1482//
1483// Notes:
1484// This is used for the WidenHi intrinsic to extract the upper half.
1485// On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes.
1486//
1487void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
1488{
1489 var_types simdType = simdNode->TypeGet();
1490 emitAttr emitSize = emitActualTypeSize(simdType);
1491 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1492 {
1493 instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
1494 getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01);
1495 }
1496 else
1497 {
1498 instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1499 if (tgtReg != srcReg)
1500 {
1501 inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize);
1502 }
1503 getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8);
1504 }
1505}
1506
1507//--------------------------------------------------------------------------------
1508// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations
1509//
1510// Arguments:
1511// simdNode - The GT_SIMD node
1512//
1513// Notes:
1514// The Widen intrinsics are broken into separate intrinsics for the two results.
1515//
1516void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode)
1517{
1518 assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) ||
1519 (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi));
1520
1521 GenTree* op1 = simdNode->gtGetOp1();
1522 var_types baseType = simdNode->gtSIMDBaseType;
1523 regNumber targetReg = simdNode->gtRegNum;
1524 assert(targetReg != REG_NA);
1525 var_types simdType = simdNode->TypeGet();
1526 SIMDLevel level = compiler->getSIMDSupportLevel();
1527
1528 genConsumeOperands(simdNode);
1529 regNumber op1Reg = op1->gtRegNum;
1530 regNumber srcReg = op1Reg;
1531 emitAttr emitSize = emitActualTypeSize(simdType);
1532 instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1533
1534 if (baseType == TYP_FLOAT)
1535 {
1536 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
1537 {
1538 genSIMDExtractUpperHalf(simdNode, srcReg, targetReg);
1539 srcReg = targetReg;
1540 }
1541 inst_RV_RV(widenIns, targetReg, srcReg, simdType);
1542 }
1543 else
1544 {
1545 // We will generate the following on AVX:
1546 // vpermq targetReg, op1Reg, 0xd4|0xe8
1547 // vpxor tmpReg, tmpReg
1548 // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed)
1549 // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg
1550 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1551 assert(tmpReg != op1Reg);
1552
1553 if (level == SIMD_AVX2_Supported)
1554 {
1555 // permute op1Reg and put it into targetReg
1556 unsigned ival = 0xd4;
1557 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
1558 {
1559 ival = 0xe8;
1560 }
1561 getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival);
1562 }
1563 else if (targetReg != op1Reg)
1564 {
1565 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
1566 }
1567
1568 genSIMDZero(simdType, baseType, tmpReg);
1569 if (!varTypeIsUnsigned(baseType))
1570 {
1571 instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType);
1572 inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize);
1573 }
1574 inst_RV_RV(widenIns, targetReg, tmpReg, simdType);
1575 }
1576 genProduceReg(simdNode);
1577}
1578
1579//--------------------------------------------------------------------------------
1580// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations
1581//
1582// Arguments:
1583// simdNode - The GT_SIMD node
1584//
1585// Notes:
1586// This intrinsic takes two arguments. The first operand is narrowed to produce the
1587// lower elements of the results, and the second operand produces the high elements.
1588//
1589void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
1590{
1591 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow);
1592
1593 GenTree* op1 = simdNode->gtGetOp1();
1594 GenTree* op2 = simdNode->gtGetOp2();
1595 var_types baseType = simdNode->gtSIMDBaseType;
1596 regNumber targetReg = simdNode->gtRegNum;
1597 assert(targetReg != REG_NA);
1598 var_types simdType = simdNode->TypeGet();
1599 emitAttr emitSize = emitTypeSize(simdType);
1600 SIMDLevel level = compiler->getSIMDSupportLevel();
1601
1602 genConsumeOperands(simdNode);
1603 regNumber op1Reg = op1->gtRegNum;
1604 regNumber op2Reg = op2->gtRegNum;
1605 if (baseType == TYP_DOUBLE)
1606 {
1607 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1608
1609 inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
1610 inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
1611 // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
1612 if (level == SIMD_AVX2_Supported)
1613 {
1614 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
1615 }
1616 else
1617 {
1618 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX);
1619 }
1620 }
1621 else if (varTypeIsLong(baseType))
1622 {
1623 if (level == SIMD_AVX2_Supported)
1624 {
1625 // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg.
1626 // We will generate the following:
1627 // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg)
1628 // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2)
1629 // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg)
1630 // mov tmpReg2, op1Reg
1631 // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2)
1632 // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg
1633 // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg
1634 // punpcklqdq tgtReg, tmpReg
1635 regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1636 regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1637 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1638 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
1639 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01);
1640 inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize);
1641 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
1642 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX);
1643 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX);
1644 inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize);
1645 }
1646 else
1647 {
1648 // We will generate the following:
1649 // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements)
1650 // psrldq targetReg, 8 (shift them right to get zeros in the high elements)
1651 // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements)
1652 // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements)
1653 // por targetReg, tmpReg
1654 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1655 instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1656 instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1657 emitAttr emitSize = emitTypeSize(simdType);
1658
1659 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX);
1660 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8);
1661 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX);
1662 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8);
1663 inst_RV_RV(INS_por, targetReg, tmpReg, simdType);
1664 }
1665 }
1666 else
1667 {
1668 // We will generate the following:
1669 // mov targetReg, op1Reg
1670 // mov tmpReg, op2Reg
1671 // psll? targetReg, shiftCount
1672 // pslr? targetReg, shiftCount
1673 // psll? tmpReg, shiftCount
1674 // pslr? tmpReg, shiftCount
1675 // <pack> targetReg, tmpReg
1676 // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType),
1677 // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to
1678 // get CLR type semantics; otherwise it will saturate).
1679 //
1680 int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2);
1681 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1682 instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
1683 instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1684
1685 if (level == SIMD_AVX2_Supported)
1686 {
1687 regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1688 regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1689
1690 // The AVX instructions generally operate on "lanes", so we have to permute the
1691 // inputs so that the destination register has the low 128-bit halves of the two
1692 // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs.
1693 getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20);
1694 getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31);
1695 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount);
1696 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount);
1697 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
1698 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount);
1699 inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType));
1700 }
1701 else
1702 {
1703 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1704
1705 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
1706 inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize);
1707
1708 instruction tmpShiftRight = shiftRightIns;
1709 if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported)
1710 {
1711 tmpShiftRight = INS_psrad;
1712 }
1713
1714 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount);
1715 getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount);
1716 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
1717 getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount);
1718 inst_RV_RV(ins, targetReg, tmpReg, simdType);
1719 }
1720 }
1721 genProduceReg(simdNode);
1722}
1723
1724//--------------------------------------------------------------------------------
1725// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
1726// add, sub, mul, bit-wise And, AndNot and Or.
1727//
1728// Arguments:
1729// simdNode - The GT_SIMD node
1730//
1731// Return Value:
1732// None.
1733//
1734void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
1735{
1736 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
1737 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
1738 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
1739 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
1740 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr ||
1741 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin ||
1742 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax);
1743
1744 GenTree* op1 = simdNode->gtGetOp1();
1745 GenTree* op2 = simdNode->gtGetOp2();
1746 var_types baseType = simdNode->gtSIMDBaseType;
1747 regNumber targetReg = simdNode->gtRegNum;
1748 assert(targetReg != REG_NA);
1749 var_types targetType = simdNode->TypeGet();
1750 SIMDLevel level = compiler->getSIMDSupportLevel();
1751
1752 genConsumeOperands(simdNode);
1753 regNumber op1Reg = op1->gtRegNum;
1754 regNumber op2Reg = op2->gtRegNum;
1755 regNumber otherReg = op2Reg;
1756
1757 // Vector<Int>.Mul:
1758 // SSE2 doesn't have an instruction to perform this operation directly
1759 // whereas SSE4.1 does (pmulld). This is special cased and computed
1760 // as follows.
1761 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported)
1762 {
1763 // We need a temporary register that is NOT the same as the target,
1764 // and we MAY need another.
1765 regNumber tmpReg = simdNode->ExtractTempReg();
1766 regNumber tmpReg2 = simdNode->GetSingleTempReg();
1767
1768 // The register allocator guarantees the following conditions:
1769 // - the only registers that may be the same among op1Reg, op2Reg, tmpReg
1770 // and tmpReg2 are op1Reg and op2Reg.
1771 // Let's be extra-careful and assert that now.
1772 assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) &&
1773 (tmpReg != tmpReg2));
1774
1775 // We will start by setting things up so that:
1776 // - We have op1 in op1Reg and targetReg, and they are different registers.
1777 // - We have op2 in op2Reg and tmpReg
1778 // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified,
1779 // OR they are the targetReg that will be produced.
1780 // (Note that in the code we generate below op1Reg and op2Reg are never written.)
1781 // We will copy things as necessary to ensure that this is the case.
1782 // Note that we can swap op1 and op2, since multiplication is commutative.
1783 // We will not modify the values in op1Reg and op2Reg.
1784 // (Though note that if either op1 or op2 is the same as targetReg, we will make
1785 // a copy and use that copy as the input register. In that case we WILL modify
1786 // the original value in the register, but will wind up with the result in targetReg
1787 // in the end, as expected.)
1788
1789 // First, we need a tmpReg that is NOT the same as targetReg.
1790 // Note that if we have another reg that is the same as targetReg,
1791 // we can use tmpReg2 for that case, as we will not have hit this case.
1792 if (tmpReg == targetReg)
1793 {
1794 tmpReg = tmpReg2;
1795 }
1796
1797 if (op2Reg == targetReg)
1798 {
1799 // We will swap the operands.
1800 // Since the code below only deals with registers, this now becomes the case where
1801 // op1Reg == targetReg.
1802 op2Reg = op1Reg;
1803 op1Reg = targetReg;
1804 }
1805 if (op1Reg == targetReg)
1806 {
1807 // Copy op1, and make tmpReg2 the new op1Reg.
1808 // Note that those regs can't be the same, as we asserted above.
1809 // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit
1810 // the "tmpReg == targetReg" case.
1811 inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType));
1812 op1Reg = tmpReg2;
1813 inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
1814 // However, we have one more case to worry about: what if op2Reg is also targetReg
1815 // (i.e. we have the same operand as op1 and op2)?
1816 // In that case we will set op2Reg to the same register as op1Reg.
1817 if (op2Reg == targetReg)
1818 {
1819 op2Reg = tmpReg2;
1820 }
1821 }
1822 else
1823 {
1824 // Copy op1 to targetReg and op2 to tmpReg.
1825 inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1826 inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
1827 }
1828 // Let's assert that things are as we expect.
1829 // - We have op1 in op1Reg and targetReg, and they are different registers.
1830 assert(op1Reg != targetReg);
1831 // - We have op2 in op2Reg and tmpReg, and they are different registers.
1832 assert(op2Reg != tmpReg);
1833 // - Either we are going to leave op1's reg unmodified, or it is the targetReg.
1834 assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg));
1835 // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg.
1836 assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg));
1837
1838 // Now we can generate the code.
1839
1840 // targetReg = op1 >> 4-bytes (op1 is already in targetReg)
1841 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4);
1842
1843 // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg)
1844 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4);
1845
1846 // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially
1847 // tmpReg[63:0] = op1[1] * op2[1]
1848 // tmpReg[127:64] = op1[3] * op2[3]
1849 inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType));
1850
1851 // Extract first and third double word results from tmpReg
1852 // tmpReg = shuffle(0,0,2,0) of tmpReg
1853 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX);
1854
1855 // targetReg[63:0] = op1[0] * op2[0]
1856 // targetReg[127:64] = op1[2] * op2[2]
1857 inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1858 inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType));
1859
1860 // Extract first and third double word results from targetReg
1861 // targetReg = shuffle(0,0,2,0) of targetReg
1862 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX);
1863
1864 // pack the results into a single vector
1865 inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
1866 }
1867 else
1868 {
1869 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1870
1871 // Currently AVX doesn't support integer.
1872 // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
1873 if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
1874 !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins))
1875 {
1876 inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
1877 }
1878 else
1879 {
1880 if (op2Reg == targetReg)
1881 {
1882 otherReg = op1Reg;
1883 }
1884 else if (op1Reg != targetReg)
1885 {
1886 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1887 }
1888
1889 inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1890 }
1891 }
1892
1893 // Vector2/3 div: since the top-most elements will be zero, we end up
1894 // perfoming 0/0 which is a NAN. Therefore, post division we need to set the
1895 // top-most elements to zero. This is achieved by left logical shift followed
1896 // by right logical shift of targetReg.
1897 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
1898 {
1899 // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
1900 unsigned shiftCount = 16 - simdNode->gtSIMDSize;
1901 assert(shiftCount != 0);
1902 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1903 getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1904 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1905 getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1906 }
1907
1908 genProduceReg(simdNode);
1909}
1910
1911//--------------------------------------------------------------------------------
1912// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operater
1913// <, <=, >, >= and ==
1914//
1915// Arguments:
1916// simdNode - The GT_SIMD node
1917//
1918// Return Value:
1919// None.
1920//
1921void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
1922{
1923 GenTree* op1 = simdNode->gtGetOp1();
1924 GenTree* op2 = simdNode->gtGetOp2();
1925 var_types baseType = simdNode->gtSIMDBaseType;
1926 regNumber targetReg = simdNode->gtRegNum;
1927 var_types targetType = simdNode->TypeGet();
1928 SIMDLevel level = compiler->getSIMDSupportLevel();
1929
1930 genConsumeOperands(simdNode);
1931 regNumber op1Reg = op1->gtRegNum;
1932 regNumber op2Reg = op2->gtRegNum;
1933 regNumber otherReg = op2Reg;
1934
1935 switch (simdNode->gtSIMDIntrinsicID)
1936 {
1937 case SIMDIntrinsicEqual:
1938 case SIMDIntrinsicGreaterThan:
1939 {
1940 assert(targetReg != REG_NA);
1941
1942#ifdef DEBUG
1943 // SSE2: vector<(u)long> relational op should be implemented in terms of
1944 // TYP_INT comparison operations
1945 if (baseType == TYP_LONG || baseType == TYP_ULONG)
1946 {
1947 assert(level >= SIMD_SSE4_Supported);
1948 }
1949#endif
1950
1951 // Greater-than: Floating point vectors use "<" with swapped operands
1952 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
1953 {
1954 assert(!varTypeIsFloating(baseType));
1955 }
1956
1957 unsigned ival = 0;
1958 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1959
1960 // targetReg = op1reg > op2reg
1961 // Therefore, we can optimize if op1Reg == targetReg
1962 otherReg = op2Reg;
1963 if (op1Reg != targetReg)
1964 {
1965 if (op2Reg == targetReg)
1966 {
1967 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
1968 otherReg = op1Reg;
1969 }
1970 else
1971 {
1972 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1973 }
1974 }
1975
1976 if (varTypeIsFloating(baseType))
1977 {
1978 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival);
1979 }
1980 else
1981 {
1982 inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1983 }
1984 }
1985 break;
1986
1987 case SIMDIntrinsicLessThan:
1988 case SIMDIntrinsicLessThanOrEqual:
1989 {
1990 assert(targetReg != REG_NA);
1991
1992 // Int vectors use ">" and ">=" with swapped operands
1993 assert(varTypeIsFloating(baseType));
1994
1995 // Get the instruction opcode for compare operation
1996 unsigned ival;
1997 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1998
1999 // targetReg = op1reg RelOp op2reg
2000 // Thefore, we can optimize if op1Reg == targetReg
2001 if (op1Reg != targetReg)
2002 {
2003 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
2004 }
2005
2006 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
2007 }
2008 break;
2009
2010 // (In)Equality that produces bool result instead of a bit vector
2011 case SIMDIntrinsicOpEquality:
2012 case SIMDIntrinsicOpInEquality:
2013 {
2014 // We're only setting condition flags, if a 0/1 value is desired then Lowering should have inserted a SETCC.
2015 assert(targetReg == REG_NA);
2016
2017 var_types simdType = op1->TypeGet();
2018 // TODO-1stClassStructs: Temporary to minimize asmDiffs
2019 if (simdType == TYP_DOUBLE)
2020 {
2021 simdType = TYP_SIMD8;
2022 }
2023
2024 // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
2025 // since both the operands will be in XMM registers.
2026 if (simdType == TYP_SIMD12)
2027 {
2028 simdType = TYP_SIMD16;
2029 }
2030
2031 // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest.
2032 if (op2->isContained())
2033 {
2034 assert((compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0));
2035 inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
2036 }
2037 else
2038 {
2039 // We need one additional SIMD register to store the result of the SIMD compare.
2040 regNumber tmpReg1 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
2041
2042 // tmpReg1 = (op1Reg == op2Reg)
2043 // Call this value of tmpReg1 as 'compResult' for further reference below.
2044 regNumber otherReg = op2Reg;
2045 if (tmpReg1 != op2Reg)
2046 {
2047 if (tmpReg1 != op1Reg)
2048 {
2049 inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
2050 }
2051 }
2052 else
2053 {
2054 otherReg = op1Reg;
2055 }
2056
2057 // For all integer types we can use TYP_INT comparison.
2058 unsigned ival = 0;
2059 instruction ins =
2060 getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
2061
2062 if (varTypeIsFloating(baseType))
2063 {
2064 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
2065 }
2066 else
2067 {
2068 inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
2069 }
2070
2071 regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT);
2072 inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
2073 // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
2074 // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
2075 // subset of each component's ones/zeroes. In the end we need to know if the result is
2076 // "all ones" where the number of ones is given by the vector byte size, not by the
2077 // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
2078 // for SSE registers we need to compare to 0x0000FFFF.
2079 // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
2080 // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
2081 // Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
2082 // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
2083 // encoding instead of 83F8FF.
2084 ssize_t mask;
2085 if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
2086 {
2087 mask = 0x00000FFF;
2088 getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
2089 }
2090 else if (emitActualTypeSize(simdType) == 32)
2091 {
2092 mask = -1;
2093 }
2094 else
2095 {
2096 mask = 0x0000FFFF;
2097 }
2098 getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
2099 }
2100 }
2101 break;
2102
2103 default:
2104 noway_assert(!"Unimplemented SIMD relational operation.");
2105 unreached();
2106 }
2107
2108 genProduceReg(simdNode);
2109}
2110
2111//--------------------------------------------------------------------------------
2112// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
2113//
2114// Arguments:
2115// simdNode - The GT_SIMD node
2116//
2117// Return Value:
2118// None.
2119//
2120void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
2121{
2122 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
2123
2124 GenTree* op1 = simdNode->gtGetOp1();
2125 GenTree* op2 = simdNode->gtGetOp2();
2126 var_types baseType = simdNode->gtSIMDBaseType;
2127 var_types simdType = op1->TypeGet();
2128 // TODO-1stClassStructs: Temporary to minimize asmDiffs
2129 if (simdType == TYP_DOUBLE)
2130 {
2131 simdType = TYP_SIMD8;
2132 }
2133 var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
2134 regNumber targetReg = simdNode->gtRegNum;
2135 assert(targetReg != REG_NA);
2136
2137 var_types targetType = simdNode->TypeGet();
2138 assert(targetType == baseType);
2139
2140 genConsumeOperands(simdNode);
2141 regNumber op1Reg = op1->gtRegNum;
2142 regNumber op2Reg = op2->gtRegNum;
2143 regNumber tmpReg1 = REG_NA;
2144 regNumber tmpReg2 = REG_NA;
2145
2146 SIMDLevel level = compiler->getSIMDSupportLevel();
2147
2148 // Dot product intrinsic is supported only on float/double vectors
2149 // and 32-byte int vectors on AVX.
2150 //
2151 // Float/Double Vectors:
2152 // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register
2153 // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or
2154 // smaller on AVX, then we don't need a tmpReg.
2155 //
2156 // 32-byte integer vector on AVX: we need two additional Xmm registers
2157 // different from targetReg as scratch.
2158 //
2159 // 16-byte integer vector on SSE4: we need one additional Xmm register
2160 // different from targetReg as scratch.
2161 if (varTypeIsFloating(baseType))
2162 {
2163 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32))
2164 {
2165 tmpReg1 = simdNode->GetSingleTempReg();
2166 assert(tmpReg1 != targetReg);
2167 }
2168 else
2169 {
2170 assert(simdNode->AvailableTempRegCount() == 0);
2171 }
2172 }
2173 else
2174 {
2175 assert(baseType == TYP_INT);
2176 assert(level >= SIMD_SSE4_Supported);
2177
2178 if (level == SIMD_SSE4_Supported)
2179 {
2180 tmpReg1 = simdNode->GetSingleTempReg();
2181 }
2182 else
2183 {
2184 tmpReg1 = simdNode->ExtractTempReg();
2185 tmpReg2 = simdNode->GetSingleTempReg();
2186 }
2187 }
2188
2189 if (level == SIMD_SSE2_Supported)
2190 {
2191 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
2192 if (op1Reg == targetReg)
2193 {
2194 // Best case
2195 // nothing to do, we have registers in the right place
2196 }
2197 else if (op2Reg == targetReg)
2198 {
2199 op2Reg = op1Reg;
2200 }
2201 else
2202 {
2203 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
2204 }
2205
2206 // DotProduct(v1, v2)
2207 // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
2208 if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
2209 {
2210 assert(baseType == TYP_FLOAT);
2211 // v0 = v1 * v2
2212 // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
2213 // // position
2214 // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper
2215 // // bits
2216 // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1)
2217 // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2)
2218 // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2)
2219 //
2220 inst_RV_RV(INS_mulps, targetReg, op2Reg);
2221 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2222 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
2223 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2224 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
2225 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2226 }
2227 else if (baseType == TYP_FLOAT)
2228 {
2229 // v0 = v1 * v2
2230 // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
2231 // // position
2232 // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1)
2233 // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
2234 // tmp = v0
2235 // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2)
2236 // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
2237 // // Essentially horizontal addition of all elements.
2238 // // We could achieve the same using SSEv3 instruction
2239 // // HADDPS.
2240 //
2241 inst_RV_RV(INS_mulps, targetReg, op2Reg);
2242 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2243 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
2244 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2245 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2246 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
2247 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2248 }
2249 else
2250 {
2251 assert(baseType == TYP_DOUBLE);
2252
2253 // v0 = v1 * v2
2254 // tmp = v0 // v0 = (1, 0) - each element is given by its position
2255 // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
2256 // v0 = v0 + tmp // v0 = (1+0, 0+1)
2257 inst_RV_RV(INS_mulpd, targetReg, op2Reg);
2258 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2259 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01);
2260 inst_RV_RV(INS_addpd, targetReg, tmpReg1);
2261 }
2262 }
2263 else
2264 {
2265 assert(level >= SIMD_SSE4_Supported);
2266
2267 if (varTypeIsFloating(baseType))
2268 {
2269 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
2270 // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
2271 // use the 3-op form, so that we can avoid these copies.
2272 // TODO-CQ: Add inst_RV_RV_RV_IV().
2273 if (op1Reg == targetReg)
2274 {
2275 // Best case
2276 // nothing to do, we have registers in the right place
2277 }
2278 else if (op2Reg == targetReg)
2279 {
2280 op2Reg = op1Reg;
2281 }
2282 else
2283 {
2284 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
2285 }
2286
2287 emitAttr emitSize = emitActualTypeSize(simdEvalType);
2288 if (baseType == TYP_FLOAT)
2289 {
2290 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
2291 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
2292 unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
2293 inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
2294 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
2295 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
2296 // If this is TYP_SIMD32, we need to combine the lower & upper results.
2297 if (simdEvalType == TYP_SIMD32)
2298 {
2299 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
2300 inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
2301 }
2302 }
2303 else if (baseType == TYP_DOUBLE)
2304 {
2305 if (simdEvalType == TYP_SIMD32)
2306 {
2307 // targetReg = targetReg * op2Reg
2308 // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
2309 // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg
2310 // targetReg = targetReg + tmpReg1
2311 inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
2312 inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
2313 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
2314 inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
2315 }
2316 else
2317 {
2318 // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
2319 // dppd directly.
2320 assert(level == SIMD_SSE4_Supported);
2321 inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31);
2322 }
2323 }
2324 }
2325 else
2326 {
2327 // Dot product of 32-byte int vector on SSE4/AVX.
2328 assert(baseType == TYP_INT);
2329 assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32);
2330
2331#ifdef DEBUG
2332 // SSE4: We need 1 scratch register.
2333 // AVX2: We need 2 scratch registers.
2334 if (simdEvalType == TYP_SIMD16)
2335 {
2336 assert(tmpReg1 != REG_NA);
2337 }
2338 else
2339 {
2340 assert(tmpReg1 != REG_NA);
2341 assert(tmpReg2 != REG_NA);
2342 }
2343#endif
2344
2345 // tmpReg1 = op1 * op2
2346 if (level == SIMD_AVX2_Supported)
2347 {
2348 // On AVX take advantage 3 operand form of pmulld
2349 inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType));
2350 }
2351 else
2352 {
2353 inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType);
2354 inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType);
2355 }
2356
2357 if (simdEvalType == TYP_SIMD32)
2358 {
2359 // tmpReg2[127..0] = Upper 128-bits of tmpReg1
2360 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
2361
2362 // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0]
2363 // This will compute
2364 // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4]
2365 // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5]
2366 // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6]
2367 // tmpReg1[4] = op1[4]*op2[4] + op1[7]*op2[7]
2368 inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE);
2369 }
2370
2371 // This horizontal add will compute
2372 //
2373 // TYP_SIMD16:
2374 // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1]
2375 // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[4]*op2[4]
2376 //
2377 // TYP_SIMD32:
2378 // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5]
2379 // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[4]*op2[4] + op1[7]*op2[7]
2380 inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
2381
2382 // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1]
2383 inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
2384
2385 // TargetReg = integer result from tmpReg1
2386 // (Note that for mov_xmm2i, the int register is always in the reg2 position)
2387 inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
2388 }
2389 }
2390
2391 genProduceReg(simdNode);
2392}
2393
2394//------------------------------------------------------------------------------------
2395// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
2396//
2397// Arguments:
2398// simdNode - The GT_SIMD node
2399//
2400// Return Value:
2401// None.
2402//
2403void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
2404{
2405 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
2406
2407 GenTree* op1 = simdNode->gtGetOp1();
2408 GenTree* op2 = simdNode->gtGetOp2();
2409 var_types simdType = op1->TypeGet();
2410 assert(varTypeIsSIMD(simdType));
2411
2412 // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
2413 // since it is in XMM register.
2414 if (simdType == TYP_SIMD12)
2415 {
2416 simdType = TYP_SIMD16;
2417 }
2418
2419 var_types baseType = simdNode->gtSIMDBaseType;
2420 regNumber targetReg = simdNode->gtRegNum;
2421 assert(targetReg != REG_NA);
2422 var_types targetType = simdNode->TypeGet();
2423 assert(targetType == genActualType(baseType));
2424
2425 // GetItem has 2 operands:
2426 // - the source of SIMD type (op1)
2427 // - the index of the value to be returned.
2428 genConsumeOperands(simdNode);
2429 regNumber srcReg = op1->gtRegNum;
2430
2431 // Optimize the case of op1 is in memory and trying to access ith element.
2432 if (!op1->isUsedFromReg())
2433 {
2434 assert(op1->isContained());
2435
2436 regNumber baseReg;
2437 regNumber indexReg;
2438 int offset = 0;
2439
2440 if (op1->OperIsLocal())
2441 {
2442 // There are three parts to the total offset here:
2443 // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}.
2444 bool isEBPbased;
2445 unsigned varNum = op1->gtLclVarCommon.gtLclNum;
2446 offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
2447 if (op1->OperGet() == GT_LCL_FLD)
2448 {
2449 offset += op1->gtLclFld.gtLclOffs;
2450 }
2451 baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
2452 }
2453 else
2454 {
2455 // Require GT_IND addr to be not contained.
2456 assert(op1->OperGet() == GT_IND);
2457
2458 GenTree* addr = op1->AsIndir()->Addr();
2459 assert(!addr->isContained());
2460 baseReg = addr->gtRegNum;
2461 }
2462
2463 if (op2->isContainedIntOrIImmed())
2464 {
2465 indexReg = REG_NA;
2466 offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
2467 }
2468 else
2469 {
2470 indexReg = op2->gtRegNum;
2471 assert(genIsValidIntReg(indexReg));
2472 }
2473
2474 // Now, load the desired element.
2475 getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
2476 emitTypeSize(baseType), // Of the vector baseType
2477 targetReg, // To targetReg
2478 baseReg, // Base Reg
2479 indexReg, // Indexed
2480 genTypeSize(baseType), // by the size of the baseType
2481 offset);
2482 genProduceReg(simdNode);
2483 return;
2484 }
2485
2486 // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
2487 // For the non-constant case, we will use the SIMD temp location to store the vector, and
2488 // the load the desired element.
2489 // The range check will already have been performed, so at this point we know we have an index
2490 // within the bounds of the vector.
2491 if (!op2->IsCnsIntOrI())
2492 {
2493 unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
2494 noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
2495 bool isEBPbased;
2496 unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
2497 regNumber indexReg = op2->gtRegNum;
2498
2499 // Store the vector to the temp location.
2500 getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
2501 emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
2502
2503 // Now, load the desired element.
2504 getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
2505 emitTypeSize(baseType), // Of the vector baseType
2506 targetReg, // To targetReg
2507 (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
2508 indexReg, // Indexed
2509 genTypeSize(baseType), // by the size of the baseType
2510 offs);
2511 genProduceReg(simdNode);
2512 return;
2513 }
2514
2515 noway_assert(op2->isContained());
2516 noway_assert(op2->IsCnsIntOrI());
2517 unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal;
2518 unsigned int byteShiftCnt = index * genTypeSize(baseType);
2519
2520 // In general we shouldn't have an index greater than or equal to the length of the vector.
2521 // However, if we have an out-of-range access, under minOpts it will not be optimized
2522 // away. The code will throw before we reach this point, but we still need to generate
2523 // code. In that case, we will simply mask off the upper bits.
2524 if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
2525 {
2526 byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
2527 index = byteShiftCnt / genTypeSize(baseType);
2528 }
2529
2530 regNumber tmpReg = REG_NA;
2531 if (simdNode->AvailableTempRegCount() != 0)
2532 {
2533 tmpReg = simdNode->GetSingleTempReg();
2534 }
2535 else
2536 {
2537 assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) ||
2538 (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
2539 }
2540
2541 if (byteShiftCnt >= 16)
2542 {
2543 assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
2544 byteShiftCnt -= 16;
2545 regNumber newSrcReg;
2546 if (varTypeIsFloating(baseType))
2547 {
2548 newSrcReg = targetReg;
2549 }
2550 else
2551 {
2552 // Integer types
2553 assert(tmpReg != REG_NA);
2554 newSrcReg = tmpReg;
2555 }
2556 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
2557
2558 srcReg = newSrcReg;
2559 }
2560
2561 // Generate the following sequence:
2562 // 1) baseType is floating point
2563 // movaps targetReg, srcReg
2564 // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
2565 //
2566 // 2) baseType is not floating point
2567 // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
2568 // OR if tmpReg == srcReg
2569 // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
2570 // mov_xmm2i targetReg, tmpReg
2571 if (varTypeIsFloating(baseType))
2572 {
2573 if (targetReg != srcReg)
2574 {
2575 inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
2576 }
2577
2578 if (byteShiftCnt != 0)
2579 {
2580 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
2581 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
2582 }
2583 }
2584 else
2585 {
2586 if (varTypeIsSmallInt(baseType))
2587 {
2588 // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
2589 // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
2590 // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
2591 // by 8-bits if index is odd. In case of Vector<sbyte> also sign extend targetReg.
2592
2593 unsigned baseSize = genTypeSize(baseType);
2594 if (baseSize == 1)
2595 {
2596 index /= 2;
2597 }
2598 // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
2599 // Note that this doesn't matter functionally, because the instruction uses just the
2600 // low 3 bits of index, but it's better to use the right value.
2601 if (index > 8)
2602 {
2603 assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
2604 index -= 8;
2605 }
2606
2607 getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
2608
2609 bool ZeroOrSignExtnReqd = true;
2610 if (baseSize == 1)
2611 {
2612 if ((op2->gtIntCon.gtIconVal % 2) == 1)
2613 {
2614 // Right shift extracted word by 8-bits if index is odd if we are extracting a byte sized element.
2615 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
2616
2617 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
2618 ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2619 }
2620 // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
2621 }
2622 else
2623 {
2624 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
2625 assert(baseSize == 2);
2626 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2627 }
2628
2629 if (ZeroOrSignExtnReqd)
2630 {
2631 // Zero/sign extend the byte/short to 32-bits
2632 inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
2633 }
2634 }
2635 else
2636 {
2637 // We need a temp xmm register if the baseType is not floating point and
2638 // accessing non-zero'th element.
2639 instruction ins;
2640
2641 if (byteShiftCnt != 0)
2642 {
2643 assert(tmpReg != REG_NA);
2644
2645 if (tmpReg != srcReg)
2646 {
2647 inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
2648 }
2649
2650 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
2651 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
2652 }
2653 else
2654 {
2655 tmpReg = srcReg;
2656 }
2657
2658 assert(tmpReg != REG_NA);
2659 ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
2660 // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
2661 inst_RV_RV(ins, tmpReg, targetReg, baseType);
2662 }
2663 }
2664
2665 genProduceReg(simdNode);
2666}
2667
2668//------------------------------------------------------------------------------------
2669// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
2670//
2671// Arguments:
2672// simdNode - The GT_SIMD node
2673//
2674// Return Value:
2675// None.
2676//
2677// TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
2678//
2679void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
2680{
2681 // Determine index based on intrinsic ID
2682 int index = -1;
2683 switch (simdNode->gtSIMDIntrinsicID)
2684 {
2685 case SIMDIntrinsicSetX:
2686 index = 0;
2687 break;
2688 case SIMDIntrinsicSetY:
2689 index = 1;
2690 break;
2691 case SIMDIntrinsicSetZ:
2692 index = 2;
2693 break;
2694 case SIMDIntrinsicSetW:
2695 index = 3;
2696 break;
2697
2698 default:
2699 unreached();
2700 }
2701 assert(index != -1);
2702
2703 // op1 is the SIMD vector
2704 // op2 is the value to be set
2705 GenTree* op1 = simdNode->gtGetOp1();
2706 GenTree* op2 = simdNode->gtGetOp2();
2707
2708 var_types baseType = simdNode->gtSIMDBaseType;
2709 regNumber targetReg = simdNode->gtRegNum;
2710 assert(targetReg != REG_NA);
2711 var_types targetType = simdNode->TypeGet();
2712 assert(varTypeIsSIMD(targetType));
2713
2714 // the following assert must hold.
2715 // supported only on vector2f/3f/4f right now
2716 noway_assert(baseType == TYP_FLOAT);
2717 assert(op2->TypeGet() == baseType);
2718 assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));
2719
2720 genConsumeOperands(simdNode);
2721 regNumber op1Reg = op1->gtRegNum;
2722 regNumber op2Reg = op2->gtRegNum;
2723
2724 // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
2725 if (targetReg != op1Reg)
2726 {
2727 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
2728 }
2729
2730 // Right now this intrinsic is supported only for float base type vectors.
2731 // If in future need to support on other base type vectors, the below
2732 // logic needs modification.
2733 noway_assert(baseType == TYP_FLOAT);
2734
2735 if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2736 {
2737 // We need one additional int register as scratch
2738 regNumber tmpReg = simdNode->GetSingleTempReg();
2739 assert(genIsValidIntReg(tmpReg));
2740
2741 // Move the value from xmm reg to an int reg
2742 instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
2743 // (Note that for mov_xmm2i, the int register is always in the reg2 position.
2744 inst_RV_RV(ins, op2Reg, tmpReg, baseType);
2745
2746 // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
2747 // since every float has two 16-bit words.
2748 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index);
2749
2750 // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
2751 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
2752 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1);
2753 }
2754 else
2755 {
2756 unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index));
2757 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
2758 }
2759
2760 genProduceReg(simdNode);
2761}
2762
2763//------------------------------------------------------------------------
2764// genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
2765//
2766// Arguments:
2767// simdNode - The GT_SIMD node
2768//
2769// Return Value:
2770// None.
2771//
2772void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
2773{
2774 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
2775 noway_assert(compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported);
2776
2777 GenTree* op1 = simdNode->gtGetOp1();
2778 GenTree* op2 = simdNode->gtGetOp2();
2779 assert(op2->isContained());
2780 assert(op2->IsCnsIntOrI());
2781 int shuffleControl = (int)op2->AsIntConCommon()->IconValue();
2782 var_types baseType = simdNode->gtSIMDBaseType;
2783 var_types targetType = simdNode->TypeGet();
2784 regNumber targetReg = simdNode->gtRegNum;
2785 assert(targetReg != REG_NA);
2786
2787 regNumber op1Reg = genConsumeReg(op1);
2788 if (targetReg != op1Reg)
2789 {
2790 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
2791 }
2792
2793 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
2794 getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
2795 genProduceReg(simdNode);
2796}
2797
2798//-----------------------------------------------------------------------------
2799// genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
2800// Since Vector3 is not a hardware supported write size, it is performed
2801// as two writes: 8 byte followed by 4-byte.
2802//
2803// Arguments:
2804// treeNode - tree node that is attempting to store indirect
2805//
2806//
2807// Return Value:
2808// None.
2809//
2810void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
2811{
2812 assert(treeNode->OperGet() == GT_STOREIND);
2813
2814 GenTree* addr = treeNode->gtOp.gtOp1;
2815 GenTree* data = treeNode->gtOp.gtOp2;
2816
2817 // addr and data should not be contained.
2818 assert(!data->isContained());
2819 assert(!addr->isContained());
2820
2821#ifdef DEBUG
2822 // Should not require a write barrier
2823 GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
2824 assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
2825#endif
2826
2827 // Need an addtional Xmm register to extract upper 4 bytes from data.
2828 regNumber tmpReg = treeNode->GetSingleTempReg();
2829
2830 genConsumeOperands(treeNode->AsOp());
2831
2832 // 8-byte write
2833 getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);
2834
2835 // Extract upper 4-bytes from data
2836 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);
2837
2838 // 4-byte write
2839 getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
2840}
2841
2842//-----------------------------------------------------------------------------
2843// genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
2844// Since Vector3 is not a hardware supported write size, it is performed
2845// as two loads: 8 byte followed by 4-byte.
2846//
2847// Arguments:
2848// treeNode - tree node of GT_IND
2849//
2850//
2851// Return Value:
2852// None.
2853//
2854void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
2855{
2856 assert(treeNode->OperGet() == GT_IND);
2857
2858 regNumber targetReg = treeNode->gtRegNum;
2859 GenTree* op1 = treeNode->gtOp.gtOp1;
2860 assert(!op1->isContained());
2861 regNumber operandReg = genConsumeReg(op1);
2862
2863 // Need an addtional Xmm register to read upper 4 bytes, which is different from targetReg
2864 regNumber tmpReg = treeNode->GetSingleTempReg();
2865 assert(tmpReg != targetReg);
2866
2867 // Load upper 4 bytes in tmpReg
2868 getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);
2869
2870 // Load lower 8 bytes in targetReg
2871 getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
2872
2873 // combine upper 4 bytes and lower 8 bytes in targetReg
2874 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
2875
2876 genProduceReg(treeNode);
2877}
2878
2879//-----------------------------------------------------------------------------
2880// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
2881// Since Vector3 is not a hardware supported write size, it is performed
2882// as two stores: 8 byte followed by 4-byte.
2883//
2884// Arguments:
2885// treeNode - tree node that is attempting to store TYP_SIMD12 field
2886//
2887// Return Value:
2888// None.
2889//
2890void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
2891{
2892 assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));
2893
2894 unsigned offs = 0;
2895 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
2896 assert(varNum < compiler->lvaCount);
2897
2898 if (treeNode->OperGet() == GT_LCL_FLD)
2899 {
2900 offs = treeNode->gtLclFld.gtLclOffs;
2901 }
2902
2903 GenTree* op1 = treeNode->gtOp.gtOp1;
2904 assert(!op1->isContained());
2905 regNumber operandReg = genConsumeReg(op1);
2906
2907 // Need an addtional Xmm register to extract upper 4 bytes from data.
2908 regNumber tmpReg = treeNode->GetSingleTempReg();
2909
2910 // store lower 8 bytes
2911 getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);
2912
2913 // Extract upper 4-bytes from operandReg
2914 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
2915
2916 // Store upper 4 bytes
2917 getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs + 8);
2918}
2919
2920//-----------------------------------------------------------------------------
2921// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
2922// Since Vector3 is not a hardware supported read size, it is performed
2923// as two reads: 4 byte followed by 8 byte.
2924//
2925// Arguments:
2926// treeNode - tree node that is attempting to load TYP_SIMD12 field
2927//
2928// Return Value:
2929// None.
2930//
2931void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
2932{
2933 assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
2934
2935 regNumber targetReg = treeNode->gtRegNum;
2936 unsigned offs = 0;
2937 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
2938 assert(varNum < compiler->lvaCount);
2939
2940 if (treeNode->OperGet() == GT_LCL_FLD)
2941 {
2942 offs = treeNode->gtLclFld.gtLclOffs;
2943 }
2944
2945 // Need an additional Xmm register that is different from targetReg to read upper 4 bytes.
2946 regNumber tmpReg = treeNode->GetSingleTempReg();
2947 assert(tmpReg != targetReg);
2948
2949 // Read upper 4 bytes to tmpReg
2950 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs + 8);
2951
2952 // Read lower 8 bytes to targetReg
2953 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
2954
2955 // combine upper 4 bytes and lower 8 bytes in targetReg
2956 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
2957
2958 genProduceReg(treeNode);
2959}
2960
2961#ifdef _TARGET_X86_
2962
2963//-----------------------------------------------------------------------------
2964// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) type field to the stack.
2965// Since Vector3 is not a hardware supported write size, it is performed
2966// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
2967// already been adjusted.
2968//
2969// Arguments:
2970// operandReg - the xmm register containing the SIMD12 to store.
2971// tmpReg - an xmm register that can be used as a temporary for the operation.
2972//
2973// Return Value:
2974// None.
2975//
2976void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
2977{
2978 assert(genIsValidFloatReg(operandReg));
2979 assert(genIsValidFloatReg(tmpReg));
2980
2981 // 8-byte write
2982 getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
2983
2984 // Extract upper 4-bytes from data
2985 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
2986
2987 // 4-byte write
2988 getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
2989}
2990
2991//-----------------------------------------------------------------------------
2992// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
2993// Since Vector3 is not a hardware supported write size, it is performed
2994// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
2995// already been adjusted.
2996//
2997// Arguments:
2998// treeNode - tree node that is attempting to store TYP_SIMD12 field
2999//
3000// Return Value:
3001// None.
3002//
3003void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
3004{
3005 assert(treeNode->OperGet() == GT_PUTARG_STK);
3006
3007 GenTree* op1 = treeNode->gtOp.gtOp1;
3008 assert(!op1->isContained());
3009 regNumber operandReg = genConsumeReg(op1);
3010
3011 // Need an addtional Xmm register to extract upper 4 bytes from data.
3012 regNumber tmpReg = treeNode->GetSingleTempReg();
3013
3014 genStoreSIMD12ToStack(operandReg, tmpReg);
3015}
3016
3017#endif // _TARGET_X86_
3018
3019//-----------------------------------------------------------------------------
3020// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
3021// the given register, if any, or to memory.
3022//
3023// Arguments:
3024// simdNode - The GT_SIMD node
3025//
3026// Return Value:
3027// None.
3028//
3029// Notes:
3030// The upper half of all AVX registers is volatile, even the callee-save registers.
3031// When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
3032// to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
3033// register. If such a register cannot be found, it will save it to an available caller-save register.
3034// In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
3035// value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte
3036// value will be spilled to the stack.)
3037//
3038void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
3039{
3040 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);
3041
3042 GenTree* op1 = simdNode->gtGetOp1();
3043 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
3044 regNumber targetReg = simdNode->gtRegNum;
3045 regNumber op1Reg = genConsumeReg(op1);
3046 assert(op1Reg != REG_NA);
3047 assert(targetReg != REG_NA);
3048 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
3049
3050 genProduceReg(simdNode);
3051}
3052
3053//-----------------------------------------------------------------------------
3054// genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector to
3055// the given register, if any, or to memory.
3056//
3057// Arguments:
3058// simdNode - The GT_SIMD node
3059//
3060// Return Value:
3061// None.
3062//
3063// Notes:
3064// For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
3065// have their home register, this node has its targetReg on the lclVar child, and its source
3066// on the simdNode.
3067// Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
3068// an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike
3069// most spill scenarios, the saved tree will be different from the restored tree, but the spill
3070// restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
3071// spilled tree (saveNode) in order to perform the reload. We can easily find that tree,
3072// as it is in the spill descriptor for the register from which it was saved.
3073//
3074void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
3075{
3076 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);
3077
3078 GenTree* op1 = simdNode->gtGetOp1();
3079 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
3080 regNumber srcReg = simdNode->gtRegNum;
3081 regNumber lclVarReg = genConsumeReg(op1);
3082 unsigned varNum = op1->AsLclVarCommon()->gtLclNum;
3083 assert(lclVarReg != REG_NA);
3084 assert(srcReg != REG_NA);
3085 if (simdNode->gtFlags & GTF_SPILLED)
3086 {
3087 GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
3088 noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
3089 genConsumeReg(saveNode);
3090 }
3091 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
3092}
3093
3094//------------------------------------------------------------------------
3095// genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
3096// routine which in turn calls appropriate genSIMDIntrinsicXXX() routine.
3097//
3098// Arguments:
3099// simdNode - The GT_SIMD node
3100//
3101// Return Value:
3102// None.
3103//
3104// Notes:
3105// Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
3106// a limited set of methods.
3107//
3108void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
3109{
3110 // NYI for unsupported base types
3111 if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG &&
3112 simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE &&
3113 simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE &&
3114 simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE &&
3115 simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG)
3116 {
3117 noway_assert(!"SIMD intrinsic with unsupported base type.");
3118 }
3119
3120 switch (simdNode->gtSIMDIntrinsicID)
3121 {
3122 case SIMDIntrinsicInit:
3123 genSIMDIntrinsicInit(simdNode);
3124 break;
3125
3126 case SIMDIntrinsicInitN:
3127 genSIMDIntrinsicInitN(simdNode);
3128 break;
3129
3130 case SIMDIntrinsicSqrt:
3131 case SIMDIntrinsicCast:
3132 case SIMDIntrinsicAbs:
3133 genSIMDIntrinsicUnOp(simdNode);
3134 break;
3135
3136 case SIMDIntrinsicConvertToSingle:
3137 case SIMDIntrinsicConvertToInt32:
3138 genSIMDIntrinsic32BitConvert(simdNode);
3139 break;
3140
3141 case SIMDIntrinsicConvertToDouble:
3142 case SIMDIntrinsicConvertToInt64:
3143 genSIMDIntrinsic64BitConvert(simdNode);
3144 break;
3145
3146 case SIMDIntrinsicWidenLo:
3147 case SIMDIntrinsicWidenHi:
3148 genSIMDIntrinsicWiden(simdNode);
3149 break;
3150
3151 case SIMDIntrinsicNarrow:
3152 genSIMDIntrinsicNarrow(simdNode);
3153 break;
3154
3155 case SIMDIntrinsicAdd:
3156 case SIMDIntrinsicSub:
3157 case SIMDIntrinsicMul:
3158 case SIMDIntrinsicDiv:
3159 case SIMDIntrinsicBitwiseAnd:
3160 case SIMDIntrinsicBitwiseAndNot:
3161 case SIMDIntrinsicBitwiseOr:
3162 case SIMDIntrinsicBitwiseXor:
3163 case SIMDIntrinsicMin:
3164 case SIMDIntrinsicMax:
3165 genSIMDIntrinsicBinOp(simdNode);
3166 break;
3167
3168 case SIMDIntrinsicOpEquality:
3169 case SIMDIntrinsicOpInEquality:
3170 case SIMDIntrinsicEqual:
3171 case SIMDIntrinsicLessThan:
3172 case SIMDIntrinsicGreaterThan:
3173 case SIMDIntrinsicLessThanOrEqual:
3174 case SIMDIntrinsicGreaterThanOrEqual:
3175 genSIMDIntrinsicRelOp(simdNode);
3176 break;
3177
3178 case SIMDIntrinsicDotProduct:
3179 genSIMDIntrinsicDotProduct(simdNode);
3180 break;
3181
3182 case SIMDIntrinsicGetItem:
3183 genSIMDIntrinsicGetItem(simdNode);
3184 break;
3185
3186 case SIMDIntrinsicShuffleSSE2:
3187 genSIMDIntrinsicShuffleSSE2(simdNode);
3188 break;
3189
3190 case SIMDIntrinsicSetX:
3191 case SIMDIntrinsicSetY:
3192 case SIMDIntrinsicSetZ:
3193 case SIMDIntrinsicSetW:
3194 genSIMDIntrinsicSetItem(simdNode);
3195 break;
3196
3197 case SIMDIntrinsicUpperSave:
3198 genSIMDIntrinsicUpperSave(simdNode);
3199 break;
3200 case SIMDIntrinsicUpperRestore:
3201 genSIMDIntrinsicUpperRestore(simdNode);
3202 break;
3203
3204 default:
3205 noway_assert(!"Unimplemented SIMD intrinsic.");
3206 unreached();
3207 }
3208}
3209
3210#endif // FEATURE_SIMD
3211#endif //_TARGET_XARCH_
3212