1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
7 | XX XX |
8 | XX Amd64 SIMD Code Generator XX |
9 | XX XX |
10 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
11 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
12 | */ |
13 | #include "jitpch.h" |
14 | #ifdef _MSC_VER |
15 | #pragma hdrstop |
16 | #endif |
17 | |
18 | #ifdef _TARGET_XARCH_ |
19 | #ifdef FEATURE_SIMD |
20 | |
21 | #include "emit.h" |
22 | #include "codegen.h" |
23 | #include "sideeffects.h" |
24 | #include "lower.h" |
25 | #include "gcinfo.h" |
26 | #include "gcinfoencoder.h" |
27 | |
28 | // Instruction immediates |
29 | |
30 | // Insertps: |
31 | // - bits 6 and 7 of the immediate indicate which source item to select (0..3) |
32 | // - bits 4 and 5 of the immediate indicate which target item to insert into (0..3) |
33 | // - bits 0 to 3 of the immediate indicate which target item to zero |
#define INSERTPS_SOURCE_SELECT(i) ((i) << 6)
#define INSERTPS_TARGET_SELECT(i) ((i) << 4)
#define INSERTPS_ZERO(i) (1 << (i))
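
// For example, (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3))
// copies source element 0 into target element 0 and zeroes target elements 1 through 3, as done in
// genSIMDScalarMove below (the source select defaults to element 0 when omitted).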
37 | |
//------------------------------------------------------------------------
// getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
//
// Arguments:
//    intrinsicId - SIMD intrinsic Id
//    baseType    - Base type of the SIMD vector
//    ival        - Out param. Any immediate byte operand that needs to be passed with the instruction
//
// Return Value:
//    Instruction (op) to be used, and *ival is set if the instruction requires an immediate operand.
//
49 | instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/) |
50 | { |
51 | // Minimal required instruction set is SSE2. |
52 | assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); |
53 | |
54 | instruction result = INS_invalid; |
55 | switch (intrinsicId) |
56 | { |
57 | case SIMDIntrinsicInit: |
58 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
59 | { |
60 | // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory. |
61 | // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg. |
62 | // If we decide to use AVX2 only, we can remove this assert. |
63 | if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2)) |
64 | { |
65 | assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE); |
66 | } |
67 | switch (baseType) |
68 | { |
69 | case TYP_FLOAT: |
70 | result = INS_vbroadcastss; |
71 | break; |
72 | case TYP_DOUBLE: |
73 | result = INS_vbroadcastsd; |
74 | break; |
75 | case TYP_ULONG: |
76 | case TYP_LONG: |
77 | // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed |
78 | // to be TYP_LONG reg. |
79 | result = INS_vpbroadcastq; |
80 | break; |
81 | case TYP_UINT: |
82 | case TYP_INT: |
83 | result = INS_vpbroadcastd; |
84 | break; |
85 | case TYP_USHORT: |
86 | case TYP_SHORT: |
87 | result = INS_vpbroadcastw; |
88 | break; |
89 | case TYP_UBYTE: |
90 | case TYP_BYTE: |
91 | result = INS_vpbroadcastb; |
92 | break; |
93 | default: |
94 | unreached(); |
95 | } |
96 | break; |
97 | } |
98 | |
99 | // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic. |
100 | __fallthrough; |
101 | |
102 | case SIMDIntrinsicShuffleSSE2: |
103 | if (baseType == TYP_FLOAT) |
104 | { |
105 | result = INS_shufps; |
106 | } |
107 | else if (baseType == TYP_DOUBLE) |
108 | { |
109 | result = INS_shufpd; |
110 | } |
111 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
112 | { |
113 | result = INS_pshufd; |
114 | } |
115 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
116 | { |
117 | // We don't have a separate SSE2 instruction and will |
118 | // use the instruction meant for doubles since it is |
119 | // of the same size as a long. |
120 | result = INS_shufpd; |
121 | } |
122 | break; |
123 | |
124 | case SIMDIntrinsicSqrt: |
125 | if (baseType == TYP_FLOAT) |
126 | { |
127 | result = INS_sqrtps; |
128 | } |
129 | else if (baseType == TYP_DOUBLE) |
130 | { |
131 | result = INS_sqrtpd; |
132 | } |
133 | else |
134 | { |
135 | unreached(); |
136 | } |
137 | break; |
138 | |
139 | case SIMDIntrinsicAdd: |
140 | if (baseType == TYP_FLOAT) |
141 | { |
142 | result = INS_addps; |
143 | } |
144 | else if (baseType == TYP_DOUBLE) |
145 | { |
146 | result = INS_addpd; |
147 | } |
148 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
149 | { |
150 | result = INS_paddd; |
151 | } |
152 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
153 | { |
154 | result = INS_paddw; |
155 | } |
156 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
157 | { |
158 | result = INS_paddb; |
159 | } |
160 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
161 | { |
162 | result = INS_paddq; |
163 | } |
164 | break; |
165 | |
166 | case SIMDIntrinsicSub: |
167 | if (baseType == TYP_FLOAT) |
168 | { |
169 | result = INS_subps; |
170 | } |
171 | else if (baseType == TYP_DOUBLE) |
172 | { |
173 | result = INS_subpd; |
174 | } |
175 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
176 | { |
177 | result = INS_psubd; |
178 | } |
179 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
180 | { |
181 | result = INS_psubw; |
182 | } |
183 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
184 | { |
185 | result = INS_psubb; |
186 | } |
187 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
188 | { |
189 | result = INS_psubq; |
190 | } |
191 | break; |
192 | |
193 | case SIMDIntrinsicMul: |
194 | if (baseType == TYP_FLOAT) |
195 | { |
196 | result = INS_mulps; |
197 | } |
198 | else if (baseType == TYP_DOUBLE) |
199 | { |
200 | result = INS_mulpd; |
201 | } |
202 | else if (baseType == TYP_SHORT) |
203 | { |
204 | result = INS_pmullw; |
205 | } |
206 | else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
207 | { |
208 | result = INS_pmulld; |
209 | } |
210 | break; |
211 | |
212 | case SIMDIntrinsicDiv: |
213 | if (baseType == TYP_FLOAT) |
214 | { |
215 | result = INS_divps; |
216 | } |
217 | else if (baseType == TYP_DOUBLE) |
218 | { |
219 | result = INS_divpd; |
220 | } |
221 | else |
222 | { |
223 | unreached(); |
224 | } |
225 | break; |
226 | |
227 | case SIMDIntrinsicMin: |
228 | if (baseType == TYP_FLOAT) |
229 | { |
230 | result = INS_minps; |
231 | } |
232 | else if (baseType == TYP_DOUBLE) |
233 | { |
234 | result = INS_minpd; |
235 | } |
236 | else if (baseType == TYP_UBYTE) |
237 | { |
238 | result = INS_pminub; |
239 | } |
240 | else if (baseType == TYP_SHORT) |
241 | { |
242 | result = INS_pminsw; |
243 | } |
244 | else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
245 | { |
246 | if (baseType == TYP_BYTE) |
247 | { |
248 | result = INS_pminsb; |
249 | } |
250 | else if (baseType == TYP_USHORT) |
251 | { |
252 | result = INS_pminuw; |
253 | } |
254 | else if (baseType == TYP_INT) |
255 | { |
256 | result = INS_pminsd; |
257 | } |
258 | else if (baseType == TYP_UINT) |
259 | { |
260 | result = INS_pminud; |
261 | } |
262 | } |
263 | else |
264 | { |
265 | unreached(); |
266 | } |
267 | break; |
268 | |
269 | case SIMDIntrinsicMax: |
270 | if (baseType == TYP_FLOAT) |
271 | { |
272 | result = INS_maxps; |
273 | } |
274 | else if (baseType == TYP_DOUBLE) |
275 | { |
276 | result = INS_maxpd; |
277 | } |
278 | else if (baseType == TYP_UBYTE) |
279 | { |
280 | result = INS_pmaxub; |
281 | } |
282 | else if (baseType == TYP_SHORT) |
283 | { |
284 | result = INS_pmaxsw; |
285 | } |
286 | else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
287 | { |
288 | if (baseType == TYP_BYTE) |
289 | { |
290 | result = INS_pmaxsb; |
291 | } |
292 | else if (baseType == TYP_USHORT) |
293 | { |
294 | result = INS_pmaxuw; |
295 | } |
296 | else if (baseType == TYP_INT) |
297 | { |
298 | result = INS_pmaxsd; |
299 | } |
300 | else if (baseType == TYP_UINT) |
301 | { |
302 | result = INS_pmaxud; |
303 | } |
304 | } |
305 | else |
306 | { |
307 | unreached(); |
308 | } |
309 | break; |
310 | |
311 | case SIMDIntrinsicAbs: |
312 | if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
313 | { |
314 | if (baseType == TYP_INT) |
315 | { |
316 | result = INS_pabsd; |
317 | } |
318 | else if (baseType == TYP_SHORT) |
319 | { |
320 | result = INS_pabsw; |
321 | } |
322 | else if (baseType == TYP_BYTE) |
323 | { |
324 | result = INS_pabsb; |
325 | } |
326 | } |
327 | break; |
328 | |
329 | case SIMDIntrinsicEqual: |
330 | if (baseType == TYP_FLOAT) |
331 | { |
332 | result = INS_cmpps; |
333 | assert(ival != nullptr); |
334 | *ival = 0; |
335 | } |
336 | else if (baseType == TYP_DOUBLE) |
337 | { |
338 | result = INS_cmppd; |
339 | assert(ival != nullptr); |
340 | *ival = 0; |
341 | } |
342 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
343 | { |
344 | result = INS_pcmpeqd; |
345 | } |
346 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
347 | { |
348 | result = INS_pcmpeqw; |
349 | } |
350 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
351 | { |
352 | result = INS_pcmpeqb; |
353 | } |
354 | else if ((baseType == TYP_ULONG || baseType == TYP_LONG) && |
355 | (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
356 | { |
357 | result = INS_pcmpeqq; |
358 | } |
359 | break; |
360 | |
361 | case SIMDIntrinsicLessThan: |
362 | // Packed integers use > with swapped operands |
363 | assert(baseType != TYP_INT); |
364 | |
365 | if (baseType == TYP_FLOAT) |
366 | { |
367 | result = INS_cmpps; |
368 | assert(ival != nullptr); |
369 | *ival = 1; |
370 | } |
371 | else if (baseType == TYP_DOUBLE) |
372 | { |
373 | result = INS_cmppd; |
374 | assert(ival != nullptr); |
375 | *ival = 1; |
376 | } |
377 | break; |
378 | |
379 | case SIMDIntrinsicLessThanOrEqual: |
            // Packed integers use (a == b) || (b > a) in place of a <= b.
381 | assert(baseType != TYP_INT); |
382 | |
383 | if (baseType == TYP_FLOAT) |
384 | { |
385 | result = INS_cmpps; |
386 | assert(ival != nullptr); |
387 | *ival = 2; |
388 | } |
389 | else if (baseType == TYP_DOUBLE) |
390 | { |
391 | result = INS_cmppd; |
392 | assert(ival != nullptr); |
393 | *ival = 2; |
394 | } |
395 | break; |
396 | |
397 | case SIMDIntrinsicGreaterThan: |
398 | // Packed float/double use < with swapped operands |
399 | assert(!varTypeIsFloating(baseType)); |
400 | |
401 | // SSE2 supports only signed > |
402 | if (baseType == TYP_INT) |
403 | { |
404 | result = INS_pcmpgtd; |
405 | } |
406 | else if (baseType == TYP_SHORT) |
407 | { |
408 | result = INS_pcmpgtw; |
409 | } |
410 | else if (baseType == TYP_BYTE) |
411 | { |
412 | result = INS_pcmpgtb; |
413 | } |
414 | else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
415 | { |
416 | result = INS_pcmpgtq; |
417 | } |
418 | break; |
419 | |
420 | case SIMDIntrinsicBitwiseAnd: |
421 | if (baseType == TYP_FLOAT) |
422 | { |
423 | result = INS_andps; |
424 | } |
425 | else if (baseType == TYP_DOUBLE) |
426 | { |
427 | result = INS_andpd; |
428 | } |
429 | else if (varTypeIsIntegral(baseType)) |
430 | { |
431 | result = INS_pand; |
432 | } |
433 | break; |
434 | |
435 | case SIMDIntrinsicBitwiseAndNot: |
436 | if (baseType == TYP_FLOAT) |
437 | { |
438 | result = INS_andnps; |
439 | } |
440 | else if (baseType == TYP_DOUBLE) |
441 | { |
442 | result = INS_andnpd; |
443 | } |
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_pandn;
            }
452 | break; |
453 | |
454 | case SIMDIntrinsicBitwiseOr: |
455 | if (baseType == TYP_FLOAT) |
456 | { |
457 | result = INS_orps; |
458 | } |
459 | else if (baseType == TYP_DOUBLE) |
460 | { |
461 | result = INS_orpd; |
462 | } |
463 | else if (varTypeIsIntegral(baseType)) |
464 | { |
465 | result = INS_por; |
466 | } |
467 | break; |
468 | |
469 | case SIMDIntrinsicBitwiseXor: |
470 | if (baseType == TYP_FLOAT) |
471 | { |
472 | result = INS_xorps; |
473 | } |
474 | else if (baseType == TYP_DOUBLE) |
475 | { |
476 | result = INS_xorpd; |
477 | } |
478 | else if (varTypeIsIntegral(baseType)) |
479 | { |
480 | result = INS_pxor; |
481 | } |
482 | break; |
483 | |
484 | case SIMDIntrinsicCast: |
485 | result = INS_movaps; |
486 | break; |
487 | |
488 | case SIMDIntrinsicConvertToSingle: |
489 | result = INS_cvtdq2ps; |
490 | break; |
491 | |
492 | case SIMDIntrinsicConvertToDouble: |
493 | assert(baseType == TYP_LONG); |
494 | result = INS_cvtsi2sd; |
495 | break; |
496 | |
497 | case SIMDIntrinsicConvertToInt32: |
498 | assert(baseType == TYP_FLOAT); |
499 | result = INS_cvttps2dq; |
500 | break; |
501 | |
502 | case SIMDIntrinsicConvertToInt64: |
503 | assert(baseType == TYP_DOUBLE); |
504 | result = INS_cvttsd2si; |
505 | break; |
506 | |
507 | case SIMDIntrinsicNarrow: |
508 | // Note that for the integer types the caller must zero the upper bits of |
509 | // each source element, since the instructions saturate. |
510 | switch (baseType) |
511 | { |
512 | case TYP_INT: |
513 | case TYP_UINT: |
514 | if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
515 | { |
516 | result = INS_packusdw; |
517 | } |
518 | else |
519 | { |
520 | result = INS_packssdw; |
521 | } |
522 | break; |
523 | case TYP_SHORT: |
524 | case TYP_USHORT: |
525 | result = INS_packuswb; |
526 | break; |
527 | default: |
528 | assert(!"Invalid baseType for SIMDIntrinsicNarrow" ); |
529 | result = INS_invalid; |
530 | break; |
531 | } |
532 | break; |
533 | |
534 | case SIMDIntrinsicWidenLo: |
535 | // Some of these have multiple instruction implementations, with one instruction to widen the lo half, |
536 | // and another to widen the hi half. |
537 | switch (baseType) |
538 | { |
539 | case TYP_FLOAT: |
540 | result = INS_cvtps2pd; |
541 | break; |
542 | case TYP_INT: |
543 | case TYP_UINT: |
544 | result = INS_punpckldq; |
545 | break; |
546 | case TYP_SHORT: |
547 | case TYP_USHORT: |
548 | result = INS_punpcklwd; |
549 | break; |
550 | case TYP_BYTE: |
551 | case TYP_UBYTE: |
552 | result = INS_punpcklbw; |
553 | break; |
554 | default: |
555 | assert(!"Invalid baseType for SIMDIntrinsicWidenLo" ); |
556 | result = INS_invalid; |
557 | break; |
558 | } |
559 | break; |
560 | |
561 | case SIMDIntrinsicWidenHi: |
562 | switch (baseType) |
563 | { |
564 | case TYP_FLOAT: |
565 | // For this case, we actually use the same instruction. |
566 | result = INS_cvtps2pd; |
567 | break; |
568 | case TYP_INT: |
569 | case TYP_UINT: |
570 | result = INS_punpckhdq; |
571 | break; |
572 | case TYP_SHORT: |
573 | case TYP_USHORT: |
574 | result = INS_punpckhwd; |
575 | break; |
576 | case TYP_BYTE: |
577 | case TYP_UBYTE: |
578 | result = INS_punpckhbw; |
579 | break; |
580 | default: |
581 | assert(!"Invalid baseType for SIMDIntrinsicWidenHi" ); |
582 | result = INS_invalid; |
583 | break; |
584 | } |
585 | break; |
586 | |
587 | case SIMDIntrinsicShiftLeftInternal: |
588 | switch (baseType) |
589 | { |
590 | case TYP_SIMD16: |
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
592 | result = INS_pslldq; |
593 | break; |
594 | case TYP_UINT: |
595 | case TYP_INT: |
596 | result = INS_pslld; |
597 | break; |
598 | case TYP_SHORT: |
599 | case TYP_USHORT: |
600 | result = INS_psllw; |
601 | break; |
602 | default: |
603 | assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal" ); |
604 | result = INS_invalid; |
605 | break; |
606 | } |
607 | break; |
608 | |
609 | case SIMDIntrinsicShiftRightInternal: |
610 | switch (baseType) |
611 | { |
612 | case TYP_SIMD16: |
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
614 | result = INS_psrldq; |
615 | break; |
616 | case TYP_UINT: |
617 | case TYP_INT: |
618 | result = INS_psrld; |
619 | break; |
620 | case TYP_SHORT: |
621 | case TYP_USHORT: |
622 | result = INS_psrlw; |
623 | break; |
624 | default: |
625 | assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal" ); |
626 | result = INS_invalid; |
627 | break; |
628 | } |
629 | break; |
630 | |
631 | case SIMDIntrinsicUpperSave: |
632 | result = INS_vextractf128; |
633 | break; |
634 | |
635 | case SIMDIntrinsicUpperRestore: |
            result = INS_vinsertf128;
637 | break; |
638 | |
639 | default: |
640 | assert(!"Unsupported SIMD intrinsic" ); |
641 | unreached(); |
642 | } |
643 | |
644 | noway_assert(result != INS_invalid); |
645 | return result; |
646 | } |
647 | |
//------------------------------------------------------------------------
// genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
// to target mm reg, zeroing out the upper bits if and only if specified.
650 | // |
651 | // Arguments: |
652 | // targetType the target type |
653 | // baseType the base type of value to be moved |
654 | // targetReg the target reg |
655 | // srcReg the src reg |
656 | // moveType action to be performed on target upper bits |
657 | // |
658 | // Return Value: |
659 | // None |
660 | // |
661 | // Notes: |
662 | // This is currently only supported for floating point types. |
663 | // |
664 | void CodeGen::genSIMDScalarMove( |
665 | var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType) |
666 | { |
667 | assert(varTypeIsFloating(baseType)); |
668 | switch (moveType) |
669 | { |
670 | case SMT_PreserveUpper: |
671 | if (srcReg != targetReg) |
672 | { |
673 | instruction ins = ins_Store(baseType); |
674 | if (getEmitter()->IsDstSrcSrcAVXInstruction(ins)) |
675 | { |
                    // In general, when we use a three-operand move instruction, we merge the src
                    // with itself, because we don't want the "merge" behavior. This case is the
                    // exception: we actually want to preserve the upper bits of the target, so we
                    // must specify all three operands, with the target also acting as a source.
679 | inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType)); |
680 | } |
681 | else |
682 | { |
683 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
684 | } |
685 | } |
686 | break; |
687 | |
688 | case SMT_ZeroInitUpper: |
689 | if (compiler->canUseVexEncoding()) |
690 | { |
691 | // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want. |
692 | // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose |
693 | // to zero all but the lower bits. |
694 | unsigned int insertpsImm = |
695 | (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3)); |
696 | inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm); |
697 | } |
698 | else |
699 | { |
700 | if (srcReg == targetReg) |
701 | { |
                    // There is no guarantee that the upper bits of op1Reg are zero.
                    // We zero them by shifting left logically by 12 bytes and then
                    // shifting right logically by 12 bytes.
704 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
705 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); |
706 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
707 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); |
708 | } |
709 | else |
710 | { |
711 | genSIMDZero(targetType, TYP_FLOAT, targetReg); |
712 | inst_RV_RV(ins_Store(baseType), targetReg, srcReg); |
713 | } |
714 | } |
715 | break; |
716 | |
717 | case SMT_ZeroInitUpper_SrcHasUpperZeros: |
718 | if (srcReg != targetReg) |
719 | { |
720 | instruction ins = ins_Copy(baseType); |
721 | assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins)); |
722 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
723 | } |
724 | break; |
725 | |
726 | default: |
727 | unreached(); |
728 | } |
729 | } |
730 | |
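//------------------------------------------------------------------------
// genSIMDZero: Generate code to zero out a SIMD register.
//
// Arguments:
//    targetType - the target SIMD type
//    baseType   - the base (element) type of the vector
//    targetReg  - the register to zero
//
// Return Value:
//    None.
//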
731 | void CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg) |
732 | { |
    // We just use `INS_xorps` instead of `getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType)`
    // since `genSIMDZero` is used for both `System.Numerics.Vectors` and HardwareIntrinsics. Modern
    // CPUs handle this idiom specially in the renamer, so it never hits the execution pipeline, and
    // `INS_xorps` is always available (under both the legacy and VEX encodings).
737 | inst_RV_RV(INS_xorps, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
738 | } |
739 | |
740 | //------------------------------------------------------------------------ |
741 | // genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize. |
742 | // |
743 | // Arguments: |
744 | // simdNode - The GT_SIMD node |
745 | // |
746 | // Return Value: |
747 | // None. |
748 | // |
749 | void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode) |
750 | { |
751 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit); |
752 | |
753 | GenTree* op1 = simdNode->gtGetOp1(); |
754 | var_types baseType = simdNode->gtSIMDBaseType; |
755 | regNumber targetReg = simdNode->gtRegNum; |
756 | assert(targetReg != REG_NA); |
757 | var_types targetType = simdNode->TypeGet(); |
758 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
759 | unsigned size = simdNode->gtSIMDSize; |
760 | |
761 | // Should never see small int base type vectors except for zero initialization. |
762 | noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0)); |
763 | |
764 | instruction ins = INS_invalid; |
765 | |
766 | #if !defined(_TARGET_64BIT_) |
767 | if (op1->OperGet() == GT_LONG) |
768 | { |
769 | assert(varTypeIsLong(baseType)); |
770 | |
771 | GenTree* op1lo = op1->gtGetOp1(); |
772 | GenTree* op1hi = op1->gtGetOp2(); |
773 | |
774 | if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) |
775 | { |
776 | genSIMDZero(targetType, baseType, targetReg); |
777 | } |
778 | else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)) |
779 | { |
780 | // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg. |
781 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT); |
782 | inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
783 | } |
784 | else |
785 | { |
786 | // Generate: |
787 | // mov_i2xmm targetReg, op1lo |
788 | // mov_i2xmm xmmtmp, op1hi |
789 | // shl xmmtmp, 4 bytes |
790 | // por targetReg, xmmtmp |
791 | // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using: |
792 | // shufpd targetReg, targetReg, 0 // move the long to all the lanes |
793 | // For AVX2, move it to all 4 of the 64-bit lanes using: |
794 | // vpbroadcastq targetReg, targetReg |
795 | |
798 | regNumber op1loReg = genConsumeReg(op1lo); |
799 | ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT); |
800 | inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT)); |
801 | |
802 | regNumber tmpReg = simdNode->GetSingleTempReg(); |
803 | |
804 | regNumber op1hiReg = genConsumeReg(op1hi); |
805 | ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT); |
806 | inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT)); |
807 | |
808 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
809 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes |
810 | |
811 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType); |
812 | inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
813 | |
814 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
815 | { |
816 | inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32)); |
817 | } |
818 | else |
819 | { |
820 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType); |
821 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0); |
822 | } |
823 | } |
824 | } |
825 | else |
826 | #endif // !defined(_TARGET_64BIT_) |
827 | if (op1->isContained()) |
828 | { |
829 | if (op1->IsIntegralConst(0) || op1->IsFPZero()) |
830 | { |
831 | genSIMDZero(targetType, baseType, targetReg); |
832 | } |
833 | else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1)) |
834 | { |
835 | // case of initializing elements of vector with all 1's |
836 | // generate pcmpeqd reg, reg |
837 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT); |
838 | inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
839 | } |
840 | else |
841 | { |
842 | assert(level == SIMD_AVX2_Supported); |
843 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType); |
844 | if (op1->IsCnsFltOrDbl()) |
845 | { |
846 | getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1); |
847 | } |
848 | else if (op1->OperIsLocalAddr()) |
849 | { |
850 | unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0; |
851 | getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, |
852 | offset); |
853 | } |
854 | else |
855 | { |
856 | unreached(); |
857 | } |
858 | } |
859 | } |
860 | else if (level == SIMD_AVX2_Supported && ((size == 32) || (size == 16))) |
861 | { |
862 | regNumber srcReg = genConsumeReg(op1); |
863 | if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG) |
864 | { |
865 | ins = ins_CopyIntToFloat(baseType, TYP_FLOAT); |
866 | assert(ins != INS_invalid); |
867 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
868 | srcReg = targetReg; |
869 | } |
870 | |
871 | ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
872 | getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg); |
873 | } |
874 | else |
875 | { |
        // If we reach here, op1 is not contained, and either we are targeting SSE or this is a
        // sub-register SIMD type. In either case we are going to use the SSE2 shuffle instruction.
878 | |
879 | regNumber op1Reg = genConsumeReg(op1); |
880 | unsigned shuffleControl = 0; |
881 | |
882 | if (compiler->isSubRegisterSIMDType(simdNode)) |
883 | { |
884 | assert(baseType == TYP_FLOAT); |
885 | |
            // We cannot assume that the upper bits of op1Reg or targetReg are zero.
            // Therefore we need to explicitly zero them out. This is essential for the
            // shuffle operation performed below.
            //
            // If op1 is a float/double constant, we would have loaded it from the data
            // section using movss/sd. Similarly, if op1 is a memory op, we would have
            // loaded it using movss/sd, which zeroes out the upper bits of an xmm reg
            // when loading from memory. In these cases we can avoid explicitly zeroing
            // out targetReg if targetReg and op1Reg are the same, or do it more
            // efficiently if they are not.
896 | SIMDScalarMoveType moveType = |
897 | op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper; |
898 | |
899 | genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType); |
900 | |
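            // Choose a shuffle control that replicates element 0 into the live lanes and fills
            // the remaining lanes from the (already zeroed) upper elements:
            // 0x50 yields lanes (v, v, 0, 0) for Vector2; 0x40 yields lanes (v, v, v, 0) for Vector3.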
901 | if (size == 8) |
902 | { |
903 | shuffleControl = 0x50; |
904 | } |
905 | else if (size == 12) |
906 | { |
907 | shuffleControl = 0x40; |
908 | } |
909 | else |
910 | { |
911 | noway_assert(!"Unexpected size for SIMD type" ); |
912 | } |
913 | } |
914 | else // Vector<T> |
915 | { |
916 | if (op1Reg != targetReg) |
917 | { |
918 | if (varTypeIsFloating(baseType)) |
919 | { |
920 | ins = ins_Copy(targetType); |
921 | } |
922 | else if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG) |
923 | { |
924 | ins = ins_CopyIntToFloat(baseType, TYP_FLOAT); |
925 | } |
926 | |
927 | assert(ins != INS_invalid); |
928 | inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType)); |
929 | } |
930 | } |
931 | |
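        // Broadcast the value with the SSE2 shuffle; for full-register vectors shuffleControl is 0,
        // which replicates element 0 into every lane.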
932 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType); |
933 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl); |
934 | } |
935 | |
936 | genProduceReg(simdNode); |
937 | } |
938 | |
939 | //------------------------------------------------------------------------------------------- |
940 | // genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes |
941 | // a number of arguments equal to the length of the Vector. |
942 | // |
943 | // Arguments: |
944 | // simdNode - The GT_SIMD node |
945 | // |
946 | // Return Value: |
947 | // None. |
948 | // |
949 | void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode) |
950 | { |
951 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN); |
952 | |
953 | // Right now this intrinsic is supported only on TYP_FLOAT vectors |
954 | var_types baseType = simdNode->gtSIMDBaseType; |
955 | noway_assert(baseType == TYP_FLOAT); |
956 | |
957 | regNumber targetReg = simdNode->gtRegNum; |
958 | assert(targetReg != REG_NA); |
959 | |
960 | var_types targetType = simdNode->TypeGet(); |
961 | |
    // Note that we cannot use targetReg until we have consumed all the source operands.
    // Therefore we need an internal register to stitch together all the values into a
    // single vector in an XMM reg.
965 | regNumber vectorReg = simdNode->GetSingleTempReg(); |
966 | |
    // Zero out vectorReg if we are constructing a vector whose size is not equal to the targetType
    // vector size. For example, in the case of Vector4f we don't need to zero when using SSE2.
969 | if (compiler->isSubRegisterSIMDType(simdNode)) |
970 | { |
971 | genSIMDZero(targetType, baseType, vectorReg); |
972 | } |
973 | |
974 | unsigned int baseTypeSize = genTypeSize(baseType); |
975 | instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
976 | |
977 | // We will first consume the list items in execution (left to right) order, |
978 | // and record the registers. |
979 | regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT]; |
980 | unsigned initCount = 0; |
981 | for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2()) |
982 | { |
983 | assert(list->OperGet() == GT_LIST); |
984 | GenTree* listItem = list->gtGetOp1(); |
985 | assert(listItem->TypeGet() == baseType); |
986 | assert(!listItem->isContained()); |
987 | regNumber operandReg = genConsumeReg(listItem); |
988 | operandRegs[initCount] = operandReg; |
989 | initCount++; |
990 | } |
991 | |
992 | unsigned int offset = 0; |
993 | for (unsigned i = 0; i < initCount; i++) |
994 | { |
        // We will now construct the vector from the list items in reverse order.
        // This allows us to efficiently stitch together a vector as follows:
        //    vectorReg = (vectorReg << offset)
        //    vectorReg[0] = listItemReg
        // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
        // bits of vectorReg are not modified.
1001 | |
1002 | regNumber operandReg = operandRegs[initCount - i - 1]; |
1003 | if (offset != 0) |
1004 | { |
1005 | getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize); |
1006 | } |
1007 | genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper); |
1008 | |
1009 | offset += baseTypeSize; |
1010 | } |
1011 | |
1012 | noway_assert(offset == simdNode->gtSIMDSize); |
1013 | |
    // Copy the fully initialized vector to the target register.
1015 | if (targetReg != vectorReg) |
1016 | { |
1017 | inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType)); |
1018 | } |
1019 | genProduceReg(simdNode); |
1020 | } |
1021 | |
1022 | //---------------------------------------------------------------------------------- |
1023 | // genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt. |
1024 | // |
1025 | // Arguments: |
1026 | // simdNode - The GT_SIMD node |
1027 | // |
1028 | // Return Value: |
1029 | // None. |
1030 | // |
1031 | void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode) |
1032 | { |
1033 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast || |
1034 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAbs); |
1035 | |
1036 | GenTree* op1 = simdNode->gtGetOp1(); |
1037 | var_types baseType = simdNode->gtSIMDBaseType; |
1038 | regNumber targetReg = simdNode->gtRegNum; |
1039 | assert(targetReg != REG_NA); |
1040 | var_types targetType = simdNode->TypeGet(); |
1041 | |
1042 | regNumber op1Reg = genConsumeReg(op1); |
1043 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1044 | if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg) |
1045 | { |
1046 | inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1047 | } |
1048 | genProduceReg(simdNode); |
1049 | } |
1050 | |
1051 | //---------------------------------------------------------------------------------- |
1052 | // genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float) |
1053 | // |
1054 | // Arguments: |
1055 | // simdNode - The GT_SIMD node |
1056 | // |
1057 | // Return Value: |
1058 | // None. |
1059 | // |
1060 | void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode) |
1061 | { |
1062 | SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; |
1063 | assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32)); |
1064 | |
1065 | GenTree* op1 = simdNode->gtGetOp1(); |
1066 | var_types baseType = simdNode->gtSIMDBaseType; |
1067 | regNumber targetReg = simdNode->gtRegNum; |
1068 | assert(targetReg != REG_NA); |
1069 | var_types targetType = simdNode->TypeGet(); |
1070 | |
1071 | regNumber op1Reg = genConsumeReg(op1); |
1072 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1073 | if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT) |
1074 | { |
1075 | regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
1076 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1077 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1078 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg); |
1079 | |
1080 | // We will generate the following: |
1081 | // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) |
1082 | // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) |
1083 | // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg) |
1084 | // vpslld tmpReg2, 16 |
1085 | // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2) |
1086 | // mov tmpIntReg, 0x5300000053000000 |
1087 | // vmovd tmpReg, tmpIntReg |
1088 | // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src) |
1089 | // vorps targetReg, tmpReg |
1090 | // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg) |
1091 | // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2) |
1092 | // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits) |
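        //
        // 0x53000000 is 2^39 in IEEE-754 single format: OR-ing a 16-bit value v into its mantissa
        // yields 2^39 + v * 2^16, so subtracting 2^39 back out leaves exactly float(v << 16), the
        // contribution of the upper 16 bits of each source element.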
1093 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType)); |
1094 | if (targetReg != op1Reg) |
1095 | { |
1096 | inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType)); |
1097 | } |
1098 | |
1099 | // prepare upper 16 bits |
1100 | getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16); |
1101 | |
1102 | // prepare lower 16 bits |
1103 | getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16); |
1104 | getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16); |
1105 | |
1106 | // prepare mask |
1107 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x5300000053000000);
1109 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1110 | #else |
1111 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1112 | { |
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x53000000);
1114 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1115 | } |
1116 | else |
1117 | { |
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x00005300);
1119 | inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1120 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1); |
1121 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3); |
1122 | } |
1123 | #endif |
1124 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1125 | { |
1126 | inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1127 | } |
1128 | else |
1129 | { |
1130 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1131 | } |
1132 | |
1133 | // convert upper 16 bits |
1134 | inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1135 | inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1136 | |
1137 | // convert lower 16 bits |
1138 | inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType)); |
1139 | |
1140 | // add lower 16 bits and upper 16 bits |
1141 | inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType)); |
1142 | } |
1143 | else |
1144 | { |
1145 | inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1146 | } |
1147 | genProduceReg(simdNode); |
1148 | } |
1149 | |
1150 | //---------------------------------------------------------------------------------- |
1151 | // genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double) |
1152 | // |
1153 | // Arguments: |
1154 | // intrinsicID the SIMD intrinsic ID |
1155 | // simdType the SIMD node type |
1156 | // baseType the base type of value to be converted |
1157 | // tmpReg the tmp reg |
1158 | // tmpIntReg the tmp integer reg |
1159 | // targetReg the target reg |
1160 | // |
1161 | // Return Value: |
1162 | // None. |
1163 | // |
1164 | void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID, |
1165 | var_types simdType, |
1166 | var_types baseType, |
1167 | regNumber tmpReg, |
1168 | regNumber tmpIntReg, |
1169 | regNumber targetReg) |
1170 | { |
1171 | instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType); |
1172 | if (intrinsicID == SIMDIntrinsicConvertToDouble) |
1173 | { |
1174 | // Note that for mov_xmm2i, the int register is always in the reg2 position |
1175 | inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG); |
1176 | inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType)); |
1177 | } |
1178 | else |
1179 | { |
1180 | inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType)); |
1181 | inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG); |
1182 | } |
1183 | } |
1184 | |
1185 | //---------------------------------------------------------------------------------- |
1186 | // genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double) |
1187 | // |
1188 | // Arguments: |
1189 | // simdNode - The GT_SIMD node |
1190 | // |
1191 | // Notes: |
//    There are no SSE/AVX instructions for converting a vector to/from 64-bit integers,
//    so for these we do the conversion one element at a time.
1194 | // |
1195 | void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode) |
1196 | { |
1197 | SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; |
1198 | assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64)); |
1199 | |
1200 | GenTree* op1 = simdNode->gtGetOp1(); |
1201 | var_types baseType = simdNode->gtSIMDBaseType; |
1202 | regNumber targetReg = simdNode->gtRegNum; |
1203 | assert(targetReg != REG_NA); |
1204 | var_types simdType = simdNode->TypeGet(); |
1205 | regNumber op1Reg = genConsumeReg(op1); |
1206 | regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
1207 | regNumber tmpReg; |
1208 | regNumber tmpReg2; |
1209 | regNumber tmpReg3; |
1210 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1211 | |
1212 | #ifdef _TARGET_X86_ |
1213 | if (baseType == TYP_LONG) |
1214 | { |
1215 | tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1216 | tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1217 | tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1218 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg); |
1219 | } |
1220 | else |
1221 | #endif |
1222 | if (level == SIMD_AVX2_Supported || (baseType == TYP_ULONG)) |
1223 | { |
1224 | tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1225 | tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1226 | tmpReg3 = REG_NA; |
1227 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg); |
1228 | } |
1229 | else |
1230 | { |
1231 | tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1232 | assert(tmpReg != op1Reg); |
1233 | tmpReg2 = REG_NA; |
1234 | tmpReg3 = REG_NA; |
1235 | } |
1236 | |
1237 | if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG)) |
1238 | { |
1239 | // We will generate the following |
1240 | // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) |
1241 | // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) |
1242 | // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg) |
1243 | // vpsllq tmpReg2, 32 |
1244 | // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2) |
1245 | // mov tmpIntReg, 0x4530000000000000 |
1246 | // vmovd tmpReg, tmpIntReg |
1247 | // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src) |
1248 | // vorpd targetReg, tmpReg |
1249 | // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg) |
1250 | // mov tmpIntReg, 0x4330000000000000 |
1251 | // vmovd tmpReg, tmpIntReg |
1252 | // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src) |
1253 | // vorpd tmpReg2, tmpReg |
1254 | // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2) |
1255 | // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together) |
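        //
        // 0x4530000000000000 is 2^84 and 0x4330000000000000 is 2^52 in IEEE-754 double format:
        // OR-ing a 32-bit value into the mantissa and then subtracting the constant back out
        // yields double(upper32) * 2^32 and double(lower32), respectively.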
1256 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); |
1257 | if (targetReg != op1Reg) |
1258 | { |
1259 | inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType)); |
1260 | } |
1261 | |
1262 | // prepare upper 32 bits |
1263 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); |
1264 | |
1265 | // prepare lower 32 bits |
1266 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); |
1267 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); |
1268 | |
1269 | // prepare mask for converting upper 32 bits |
1270 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x4530000000000000);
1272 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1273 | #else |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x45300000);
1275 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1276 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1277 | #endif |
1278 | if (level == SIMD_AVX2_Supported) |
1279 | { |
1280 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1281 | } |
1282 | else |
1283 | { |
1284 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1285 | } |
1286 | |
1287 | // convert upper 32 bits |
1288 | inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1289 | inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1290 | |
1291 | // prepare mask for converting lower 32 bits |
1292 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x4330000000000000);
1294 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1295 | #else |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x43300000);
1297 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1298 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1299 | #endif |
1300 | if (level == SIMD_AVX2_Supported) |
1301 | { |
1302 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1303 | } |
1304 | else |
1305 | { |
1306 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1307 | } |
1308 | |
1309 | // convert lower 32 bits |
1310 | inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1311 | inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1312 | |
1313 | // add lower 32 bits and upper 32 bits |
1314 | inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); |
1315 | } |
1316 | else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG)) |
1317 | { |
1318 | #ifdef _TARGET_AMD64_ |
1319 | instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1320 | instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1321 | |
1322 | if (level == SIMD_AVX2_Supported) |
1323 | { |
            // Extract the high 16 bytes
1325 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1326 | |
1327 | // Put v[3] (the high-order element) in tmpReg2 and convert it. |
1328 | inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1329 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1330 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); |
1331 | |
1332 | // Shift the resulting 64-bits left. |
1333 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1334 | |
1335 | // Convert v[2], in the lo bits of tmpReg. |
1336 | // For the convert to double, the convert preserves the upper bits in tmpReg2. |
1337 | // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. |
1338 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2); |
1339 | } |
1340 | |
1341 | // Put v[1] in tmpReg. |
1342 | inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); |
1343 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1344 | |
1345 | // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. |
1346 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1347 | |
1348 | // Shift the resulting 64-bits left. |
1349 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1350 | |
        // Convert the lo 64-bits into tmpReg (merging with the shifted result already there).
1352 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg); |
1353 | |
1354 | // Merge or copy the results (only at this point are we done with op1Reg). |
1355 | if (tmpReg != targetReg) |
1356 | { |
1357 | inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1358 | } |
1359 | |
1360 | if (level == SIMD_AVX2_Supported) |
1361 | { |
1362 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01); |
1363 | } |
1364 | #else |
1365 | // get the sign bit and put it in tmpReg3 |
1366 | inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType)); |
1367 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63); |
1368 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63); |
1369 | |
1370 | // get the absolute value of src and put it into tmpReg2 and targetReg |
1371 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); |
1372 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY); |
1373 | getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32); |
1374 | inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); |
1375 | inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); |
1376 | inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType)); |
1377 | |
1378 | // prepare upper 32 bits |
1379 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); |
1380 | |
1381 | // prepare lower 32 bits |
1382 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); |
1383 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); |
1384 | |
1385 | // prepare mask for converting upper 32 bits |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x45300000);
1387 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1388 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1389 | |
1390 | if (level == SIMD_AVX2_Supported) |
1391 | { |
1392 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1393 | } |
1394 | else |
1395 | { |
1396 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1397 | } |
1398 | |
1399 | // convert upper 32 bits |
1400 | inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1401 | inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1402 | |
1403 | // prepare mask for converting lower 32 bits |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x43300000);
1405 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1406 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1407 | |
1408 | if (level == SIMD_AVX2_Supported) |
1409 | { |
1410 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1411 | } |
1412 | else |
1413 | { |
1414 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1415 | } |
1416 | |
1417 | // convert lower 32 bits |
1418 | inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1419 | inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1420 | |
1421 | // add lower 32 bits and upper 32 bits |
1422 | inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); |
1423 | |
1424 | // add sign bit |
1425 | inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType)); |
1426 | #endif |
1427 | } |
1428 | else |
1429 | { |
1430 | instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1431 | instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1432 | |
1433 | if (level == SIMD_AVX2_Supported) |
1434 | { |
            // Extract the high 16 bytes
1436 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1437 | |
1438 | // Put v[3] (the high-order element) in tmpReg2 and convert it. |
1439 | inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1440 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1441 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); |
1442 | |
1443 | // Shift the resulting 64-bits left. |
1444 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1445 | |
1446 | // Convert v[2], in the lo bits of tmpReg. |
1447 | // For the convert to double, the convert preserves the upper bits in tmpReg2. |
1448 | // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. |
1449 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1450 | inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1451 | } |
1452 | |
1453 | // Put v[1] in tmpReg. |
1454 | inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); |
1455 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1456 | |
1457 | // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. |
1458 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1459 | |
1460 | // Shift the resulting 64-bits left. |
1461 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1462 | |
1463 | // Convert the lo 64-bits into targetReg |
1464 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg); |
1465 | |
1466 | // Merge or copy the results (only at this point are we done with op1Reg). |
1467 | assert(tmpReg != targetReg); |
1468 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1469 | if (level == SIMD_AVX2_Supported) |
1470 | { |
1471 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01); |
1472 | } |
1473 | } |
1474 | genProduceReg(simdNode); |
1475 | } |
1476 | |
1477 | //-------------------------------------------------------------------------------- |
1478 | // genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register |
1479 | // |
1480 | // Arguments: |
1481 | // simdNode - The GT_SIMD node |
1482 | // |
1483 | // Notes: |
1484 | // This is used for the WidenHi intrinsic to extract the upper half. |
1485 | // On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes. |
1486 | // |
void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
1488 | { |
1489 | var_types simdType = simdNode->TypeGet(); |
1490 | emitAttr emitSize = emitActualTypeSize(simdType); |
1491 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1492 | { |
        instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
1494 | getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01); |
1495 | } |
1496 | else |
1497 | { |
1498 | instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1499 | if (tgtReg != srcReg) |
1500 | { |
1501 | inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize); |
1502 | } |
1503 | getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8); |
1504 | } |
1505 | } |
1506 | |
1507 | //-------------------------------------------------------------------------------- |
1508 | // genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations |
1509 | // |
1510 | // Arguments: |
1511 | // simdNode - The GT_SIMD node |
1512 | // |
1513 | // Notes: |
1514 | // The Widen intrinsics are broken into separate intrinsics for the two results. |
1515 | // |
1516 | void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) |
1517 | { |
1518 | assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || |
1519 | (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); |
1520 | |
1521 | GenTree* op1 = simdNode->gtGetOp1(); |
1522 | var_types baseType = simdNode->gtSIMDBaseType; |
1523 | regNumber targetReg = simdNode->gtRegNum; |
1524 | assert(targetReg != REG_NA); |
1525 | var_types simdType = simdNode->TypeGet(); |
1526 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1527 | |
1528 | genConsumeOperands(simdNode); |
1529 | regNumber op1Reg = op1->gtRegNum; |
1530 | regNumber srcReg = op1Reg; |
1531 | emitAttr emitSize = emitActualTypeSize(simdType); |
1532 | instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1533 | |
1534 | if (baseType == TYP_FLOAT) |
1535 | { |
1536 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
1537 | { |
1538 | genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); |
1539 | srcReg = targetReg; |
1540 | } |
1541 | inst_RV_RV(widenIns, targetReg, srcReg, simdType); |
1542 | } |
1543 | else |
1544 | { |
        // We will generate the following on AVX2:
1546 | // vpermq targetReg, op1Reg, 0xd4|0xe8 |
1547 | // vpxor tmpReg, tmpReg |
1548 | // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) |
1549 | // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg |
1550 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1551 | assert(tmpReg != op1Reg); |
1552 | |
1553 | if (level == SIMD_AVX2_Supported) |
1554 | { |
1555 | // permute op1Reg and put it into targetReg |
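            // The permute positions the source qwords for the per-lane unpack below: with 0xd4,
            // punpckl* sees qwords 0 and 1 in the low halves of lanes 0 and 1 (WidenLo); with
            // 0xe8, punpckh* sees qwords 2 and 3 in the high halves of lanes 0 and 1 (WidenHi).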
1556 | unsigned ival = 0xd4; |
1557 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
1558 | { |
1559 | ival = 0xe8; |
1560 | } |
1561 | getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival); |
1562 | } |
1563 | else if (targetReg != op1Reg) |
1564 | { |
1565 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
1566 | } |
1567 | |
1568 | genSIMDZero(simdType, baseType, tmpReg); |
1569 | if (!varTypeIsUnsigned(baseType)) |
1570 | { |
1571 | instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType); |
1572 | inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); |
1573 | } |
1574 | inst_RV_RV(widenIns, targetReg, tmpReg, simdType); |
1575 | } |
1576 | genProduceReg(simdNode); |
1577 | } |
1578 | |
1579 | //-------------------------------------------------------------------------------- |
1580 | // genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations |
1581 | // |
1582 | // Arguments: |
1583 | // simdNode - The GT_SIMD node |
1584 | // |
1585 | // Notes: |
1586 | // This intrinsic takes two arguments. The first operand is narrowed to produce the |
1587 | // lower elements of the results, and the second operand produces the high elements. |
1588 | // |
1589 | void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) |
1590 | { |
1591 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); |
1592 | |
1593 | GenTree* op1 = simdNode->gtGetOp1(); |
1594 | GenTree* op2 = simdNode->gtGetOp2(); |
1595 | var_types baseType = simdNode->gtSIMDBaseType; |
1596 | regNumber targetReg = simdNode->gtRegNum; |
1597 | assert(targetReg != REG_NA); |
1598 | var_types simdType = simdNode->TypeGet(); |
1599 | emitAttr emitSize = emitTypeSize(simdType); |
1600 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1601 | |
1602 | genConsumeOperands(simdNode); |
1603 | regNumber op1Reg = op1->gtRegNum; |
1604 | regNumber op2Reg = op2->gtRegNum; |
1605 | if (baseType == TYP_DOUBLE) |
1606 | { |
1607 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1608 | |
1609 | inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); |
1610 | inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); |
1611 | // Now insert the high-order result (in tmpReg) into the upper half of targetReg. |
1612 | if (level == SIMD_AVX2_Supported) |
1613 | { |
1614 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); |
1615 | } |
1616 | else |
1617 | { |
1618 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX); |
1619 | } |
1620 | } |
1621 | else if (varTypeIsLong(baseType)) |
1622 | { |
1623 | if (level == SIMD_AVX2_Supported) |
1624 | { |
1625 | // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. |
1626 | // We will generate the following: |
1627 | // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) |
1628 | // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) |
1629 | // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) |
1630 | // mov tmpReg2, op1Reg |
1631 | // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) |
1632 | // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg |
1633 | // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg |
1634 | // punpcklqdq tgtReg, tmpReg |
1635 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1636 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1637 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1638 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
1639 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); |
1640 | inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize); |
1641 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
1642 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX); |
1643 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX); |
1644 | inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); |
1645 | } |
1646 | else |
1647 | { |
1648 | // We will generate the following: |
1649 | // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) |
1650 | // psrldq targetReg, 8 (shift them right to get zeros in the high elements) |
1651 | // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) |
1652 | // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) |
1653 | // por targetReg, tmpReg |
1654 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1655 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1656 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1657 | emitAttr emitSize = emitTypeSize(simdType); |
1658 | |
1659 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX); |
1660 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); |
1661 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX); |
1662 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); |
1663 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType); |
1664 | } |
1665 | } |
1666 | else |
1667 | { |
1668 | // We will generate the following: |
1669 | // mov targetReg, op1Reg |
1670 | // mov tmpReg, op2Reg |
1671 | // psll? targetReg, shiftCount |
1672 | // pslr? targetReg, shiftCount |
1673 | // psll? tmpReg, shiftCount |
1674 | // pslr? tmpReg, shiftCount |
1675 | // <pack> targetReg, tmpReg |
1676 | // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), |
1677 | // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to |
1678 | // get CLR type semantics; otherwise it will saturate). |
1679 | // |
1680 | int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); |
1681 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1682 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); |
1683 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); |
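        // For example, when narrowing Vector<int> to Vector<short>, shiftCount is
        // 4 * (8 / 2) = 16: each 32-bit element is shifted left and then right by 16 bits,
        // truncating it to its low 16 bits before the pack.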
1684 | |
1685 | if (level == SIMD_AVX2_Supported) |
1686 | { |
1687 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1688 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1689 | |
1690 | // The AVX instructions generally operate on "lanes", so we have to permute the |
1691 | // inputs so that the destination register has the low 128-bit halves of the two |
1692 | // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs. |
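            // (Each nibble of the vperm2i128 immediate selects a 128-bit lane: 0x20 gives
            // [op2.lane0 : op1.lane0] and 0x31 gives [op2.lane1 : op1.lane1], written
            // high lane : low lane.)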
1693 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); |
1694 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); |
1695 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); |
1696 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); |
1697 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
1698 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); |
1699 | inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); |
1700 | } |
1701 | else |
1702 | { |
1703 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1704 | |
1705 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
1706 | inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize); |
1707 | |
1708 | instruction tmpShiftRight = shiftRightIns; |
1709 | if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported) |
1710 | { |
1711 | tmpShiftRight = INS_psrad; |
1712 | } |
1713 | |
1714 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); |
1715 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); |
1716 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
1717 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); |
1718 | inst_RV_RV(ins, targetReg, tmpReg, simdType); |
1719 | } |
1720 | } |
1721 | genProduceReg(simdNode); |
1722 | } |
1723 | |
1724 | //-------------------------------------------------------------------------------- |
1725 | // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations |
1726 | // add, sub, mul, bit-wise And, AndNot and Or. |
1727 | // |
1728 | // Arguments: |
1729 | // simdNode - The GT_SIMD node |
1730 | // |
1731 | // Return Value: |
1732 | // None. |
1733 | // |
1734 | void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) |
1735 | { |
1736 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || |
1737 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv || |
1738 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || |
1739 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot || |
1740 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr || |
1741 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin || |
1742 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax); |
1743 | |
1744 | GenTree* op1 = simdNode->gtGetOp1(); |
1745 | GenTree* op2 = simdNode->gtGetOp2(); |
1746 | var_types baseType = simdNode->gtSIMDBaseType; |
1747 | regNumber targetReg = simdNode->gtRegNum; |
1748 | assert(targetReg != REG_NA); |
1749 | var_types targetType = simdNode->TypeGet(); |
1750 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1751 | |
1752 | genConsumeOperands(simdNode); |
1753 | regNumber op1Reg = op1->gtRegNum; |
1754 | regNumber op2Reg = op2->gtRegNum; |
1755 | regNumber otherReg = op2Reg; |
1756 | |
1757 | // Vector<Int>.Mul: |
1758 | // SSE2 doesn't have an instruction to perform this operation directly |
1759 | // whereas SSE4.1 does (pmulld). This is special cased and computed |
1760 | // as follows. |
1761 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported) |
1762 | { |
1763 | // We need a temporary register that is NOT the same as the target, |
1764 | // and we MAY need another. |
1765 | regNumber tmpReg = simdNode->ExtractTempReg(); |
1766 | regNumber tmpReg2 = simdNode->GetSingleTempReg(); |
1767 | |
1768 | // The register allocator guarantees the following conditions: |
1769 | // - the only registers that may be the same among op1Reg, op2Reg, tmpReg |
1770 | // and tmpReg2 are op1Reg and op2Reg. |
1771 | // Let's be extra-careful and assert that now. |
1772 | assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) && |
1773 | (tmpReg != tmpReg2)); |
1774 | |
1775 | // We will start by setting things up so that: |
1776 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
1777 | // - We have op2 in op2Reg and tmpReg |
1778 | // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified, |
1779 | // OR they are the targetReg that will be produced. |
1780 | // (Note that in the code we generate below op1Reg and op2Reg are never written.) |
1781 | // We will copy things as necessary to ensure that this is the case. |
1782 | // Note that we can swap op1 and op2, since multiplication is commutative. |
1783 | // We will not modify the values in op1Reg and op2Reg. |
1784 | // (Though note that if either op1 or op2 is the same as targetReg, we will make |
1785 | // a copy and use that copy as the input register. In that case we WILL modify |
1786 | // the original value in the register, but will wind up with the result in targetReg |
1787 | // in the end, as expected.) |
1788 | |
        // First, we need a tmpReg that is NOT the same as targetReg.
        // If tmpReg == targetReg, we can use tmpReg2 instead: the register allocator
        // guarantees tmpReg != op1Reg, so in that case op1Reg != targetReg and the
        // "op1Reg == targetReg" copy below (which consumes tmpReg2) cannot be needed.
1792 | if (tmpReg == targetReg) |
1793 | { |
1794 | tmpReg = tmpReg2; |
1795 | } |
1796 | |
1797 | if (op2Reg == targetReg) |
1798 | { |
1799 | // We will swap the operands. |
1800 | // Since the code below only deals with registers, this now becomes the case where |
1801 | // op1Reg == targetReg. |
1802 | op2Reg = op1Reg; |
1803 | op1Reg = targetReg; |
1804 | } |
1805 | if (op1Reg == targetReg) |
1806 | { |
1807 | // Copy op1, and make tmpReg2 the new op1Reg. |
1808 | // Note that those regs can't be the same, as we asserted above. |
1809 | // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit |
1810 | // the "tmpReg == targetReg" case. |
1811 | inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType)); |
1812 | op1Reg = tmpReg2; |
1813 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1814 | // However, we have one more case to worry about: what if op2Reg is also targetReg |
1815 | // (i.e. we have the same operand as op1 and op2)? |
1816 | // In that case we will set op2Reg to the same register as op1Reg. |
1817 | if (op2Reg == targetReg) |
1818 | { |
1819 | op2Reg = tmpReg2; |
1820 | } |
1821 | } |
1822 | else |
1823 | { |
1824 | // Copy op1 to targetReg and op2 to tmpReg. |
1825 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1826 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1827 | } |
1828 | // Let's assert that things are as we expect. |
1829 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
1830 | assert(op1Reg != targetReg); |
1831 | // - We have op2 in op2Reg and tmpReg, and they are different registers. |
1832 | assert(op2Reg != tmpReg); |
1833 | // - Either we are going to leave op1's reg unmodified, or it is the targetReg. |
1834 | assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg)); |
1835 | // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg. |
1836 | assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg)); |
1837 | |
1838 | // Now we can generate the code. |
1839 | |
1840 | // targetReg = op1 >> 4-bytes (op1 is already in targetReg) |
1841 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4); |
1842 | |
1843 | // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg) |
1844 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4); |
1845 | |
1846 | // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially |
1847 | // tmpReg[63:0] = op1[1] * op2[1] |
1848 | // tmpReg[127:64] = op1[3] * op2[3] |
1849 | inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType)); |
1850 | |
1851 | // Extract first and third double word results from tmpReg |
1852 | // tmpReg = shuffle(0,0,2,0) of tmpReg |
1853 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX); |
1854 | |
1855 | // targetReg[63:0] = op1[0] * op2[0] |
1856 | // targetReg[127:64] = op1[2] * op2[2] |
1857 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1858 | inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1859 | |
1860 | // Extract first and third double word results from targetReg |
1861 | // targetReg = shuffle(0,0,2,0) of targetReg |
1862 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX); |
1863 | |
1864 | // pack the results into a single vector |
1865 | inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1866 | } |
1867 | else |
1868 | { |
1869 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1870 | |
        // Note: INS_cvtsi2ss and INS_cvtsi2sd take an integer source, so we don't use
        // the AVX three-operand form for them.
1873 | if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && |
1874 | !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins)) |
1875 | { |
1876 | inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); |
1877 | } |
1878 | else |
1879 | { |
1880 | if (op2Reg == targetReg) |
1881 | { |
1882 | otherReg = op1Reg; |
1883 | } |
1884 | else if (op1Reg != targetReg) |
1885 | { |
1886 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1887 | } |
1888 | |
1889 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
1890 | } |
1891 | } |
1892 | |
1893 | // Vector2/3 div: since the top-most elements will be zero, we end up |
    // performing 0/0, which is a NaN. Therefore, post division we need to set the
1895 | // top-most elements to zero. This is achieved by left logical shift followed |
1896 | // by right logical shift of targetReg. |
1897 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16)) |
1898 | { |
1899 | // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length. |
1900 | unsigned shiftCount = 16 - simdNode->gtSIMDSize; |
1901 | assert(shiftCount != 0); |
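        // For example, a Vector3 divide has gtSIMDSize == 12, so shiftCount == 4;
        // shifting left then right by 4 bytes zeroes the uppermost float element.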
1902 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1903 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
1904 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1905 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
1906 | } |
1907 | |
1908 | genProduceReg(simdNode); |
1909 | } |
1910 | |
1911 | //-------------------------------------------------------------------------------- |
// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
1913 | // <, <=, >, >= and == |
1914 | // |
1915 | // Arguments: |
1916 | // simdNode - The GT_SIMD node |
1917 | // |
1918 | // Return Value: |
1919 | // None. |
1920 | // |
1921 | void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) |
1922 | { |
1923 | GenTree* op1 = simdNode->gtGetOp1(); |
1924 | GenTree* op2 = simdNode->gtGetOp2(); |
1925 | var_types baseType = simdNode->gtSIMDBaseType; |
1926 | regNumber targetReg = simdNode->gtRegNum; |
1927 | var_types targetType = simdNode->TypeGet(); |
1928 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1929 | |
1930 | genConsumeOperands(simdNode); |
1931 | regNumber op1Reg = op1->gtRegNum; |
1932 | regNumber op2Reg = op2->gtRegNum; |
1933 | regNumber otherReg = op2Reg; |
1934 | |
1935 | switch (simdNode->gtSIMDIntrinsicID) |
1936 | { |
1937 | case SIMDIntrinsicEqual: |
1938 | case SIMDIntrinsicGreaterThan: |
1939 | { |
1940 | assert(targetReg != REG_NA); |
1941 | |
1942 | #ifdef DEBUG |
            // On SSE2, vector<(u)long> relational ops should have been implemented in terms
            // of TYP_INT comparison operations, so they should only reach here on SSE4 or higher.
1945 | if (baseType == TYP_LONG || baseType == TYP_ULONG) |
1946 | { |
1947 | assert(level >= SIMD_SSE4_Supported); |
1948 | } |
1949 | #endif |
1950 | |
            // Greater-than: floating point vectors are lowered to "<" with swapped operands,
            // so we should not see a floating point base type here.
1952 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan) |
1953 | { |
1954 | assert(!varTypeIsFloating(baseType)); |
1955 | } |
1956 | |
1957 | unsigned ival = 0; |
1958 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
1959 | |
1960 | // targetReg = op1reg > op2reg |
1961 | // Therefore, we can optimize if op1Reg == targetReg |
1962 | otherReg = op2Reg; |
1963 | if (op1Reg != targetReg) |
1964 | { |
1965 | if (op2Reg == targetReg) |
1966 | { |
1967 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual); |
1968 | otherReg = op1Reg; |
1969 | } |
1970 | else |
1971 | { |
1972 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1973 | } |
1974 | } |
1975 | |
1976 | if (varTypeIsFloating(baseType)) |
1977 | { |
1978 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival); |
1979 | } |
1980 | else |
1981 | { |
1982 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
1983 | } |
1984 | } |
1985 | break; |
1986 | |
1987 | case SIMDIntrinsicLessThan: |
1988 | case SIMDIntrinsicLessThanOrEqual: |
1989 | { |
1990 | assert(targetReg != REG_NA); |
1991 | |
            // Int vectors are lowered to ">" and ">=" with swapped operands,
            // so only floating point base types should reach here.
1993 | assert(varTypeIsFloating(baseType)); |
1994 | |
1995 | // Get the instruction opcode for compare operation |
1996 | unsigned ival; |
1997 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
1998 | |
1999 | // targetReg = op1reg RelOp op2reg |
            // Therefore, we can optimize if op1Reg == targetReg
2001 | if (op1Reg != targetReg) |
2002 | { |
2003 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2004 | } |
2005 | |
2006 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival); |
2007 | } |
2008 | break; |
2009 | |
2010 | // (In)Equality that produces bool result instead of a bit vector |
2011 | case SIMDIntrinsicOpEquality: |
2012 | case SIMDIntrinsicOpInEquality: |
2013 | { |
            // We're only setting condition flags; if a 0/1 value is desired, Lowering should have inserted a SETCC.
2015 | assert(targetReg == REG_NA); |
2016 | |
2017 | var_types simdType = op1->TypeGet(); |
2018 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
2019 | if (simdType == TYP_DOUBLE) |
2020 | { |
2021 | simdType = TYP_SIMD8; |
2022 | } |
2023 | |
2024 | // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16 |
2025 | // since both the operands will be in XMM registers. |
2026 | if (simdType == TYP_SIMD12) |
2027 | { |
2028 | simdType = TYP_SIMD16; |
2029 | } |
2030 | |
2031 | // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest. |
2032 | if (op2->isContained()) |
2033 | { |
2034 | assert((compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0)); |
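                // ptest sets ZF iff the bitwise AND of its operands is zero, so testing a
                // register against itself sets ZF exactly when the vector is all zeroes.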
2035 | inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType)); |
2036 | } |
2037 | else |
2038 | { |
2039 | // We need one additional SIMD register to store the result of the SIMD compare. |
2040 | regNumber tmpReg1 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
2041 | |
2042 | // tmpReg1 = (op1Reg == op2Reg) |
2043 | // Call this value of tmpReg1 as 'compResult' for further reference below. |
2044 | regNumber otherReg = op2Reg; |
2045 | if (tmpReg1 != op2Reg) |
2046 | { |
2047 | if (tmpReg1 != op1Reg) |
2048 | { |
2049 | inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType)); |
2050 | } |
2051 | } |
2052 | else |
2053 | { |
2054 | otherReg = op1Reg; |
2055 | } |
2056 | |
2057 | // For all integer types we can use TYP_INT comparison. |
2058 | unsigned ival = 0; |
2059 | instruction ins = |
2060 | getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival); |
2061 | |
2062 | if (varTypeIsFloating(baseType)) |
2063 | { |
2064 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival); |
2065 | } |
2066 | else |
2067 | { |
2068 | inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType)); |
2069 | } |
2070 | |
2071 | regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
2072 | inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType)); |
2073 | // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare |
2074 | // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a |
2075 | // subset of each component's ones/zeroes. In the end we need to know if the result is |
2076 | // "all ones" where the number of ones is given by the vector byte size, not by the |
2077 | // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and |
2078 | // for SSE registers we need to compare to 0x0000FFFF. |
2079 | // The SIMD12 case is handled specially, because we can't rely on the upper bytes being |
2080 | // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF). |
                // Note that -1 is used instead of 0xFFFFFFFF: on x64 the emitter doesn't
                // recognize that 0xFFFFFFFF can be encoded in a single byte and emits the
                // longer 3DFFFFFFFF encoding instead of 83F8FF.
2084 | ssize_t mask; |
2085 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
2086 | { |
2087 | mask = 0x00000FFF; |
2088 | getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask); |
2089 | } |
2090 | else if (emitActualTypeSize(simdType) == 32) |
2091 | { |
2092 | mask = -1; |
2093 | } |
2094 | else |
2095 | { |
2096 | mask = 0x0000FFFF; |
2097 | } |
2098 | getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask); |
2099 | } |
2100 | } |
2101 | break; |
2102 | |
2103 | default: |
            noway_assert(!"Unimplemented SIMD relational operation.");
2105 | unreached(); |
2106 | } |
2107 | |
2108 | genProduceReg(simdNode); |
2109 | } |
2110 | |
2111 | //-------------------------------------------------------------------------------- |
2112 | // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product. |
2113 | // |
2114 | // Arguments: |
2115 | // simdNode - The GT_SIMD node |
2116 | // |
2117 | // Return Value: |
2118 | // None. |
2119 | // |
2120 | void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode) |
2121 | { |
2122 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct); |
2123 | |
2124 | GenTree* op1 = simdNode->gtGetOp1(); |
2125 | GenTree* op2 = simdNode->gtGetOp2(); |
2126 | var_types baseType = simdNode->gtSIMDBaseType; |
2127 | var_types simdType = op1->TypeGet(); |
2128 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
2129 | if (simdType == TYP_DOUBLE) |
2130 | { |
2131 | simdType = TYP_SIMD8; |
2132 | } |
2133 | var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType; |
2134 | regNumber targetReg = simdNode->gtRegNum; |
2135 | assert(targetReg != REG_NA); |
2136 | |
2137 | var_types targetType = simdNode->TypeGet(); |
2138 | assert(targetType == baseType); |
2139 | |
2140 | genConsumeOperands(simdNode); |
2141 | regNumber op1Reg = op1->gtRegNum; |
2142 | regNumber op2Reg = op2->gtRegNum; |
2143 | regNumber tmpReg1 = REG_NA; |
2144 | regNumber tmpReg2 = REG_NA; |
2145 | |
2146 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
2147 | |
    // Dot product intrinsic is supported only on float/double vectors,
    // 16-byte int vectors on SSE4, and 32-byte int vectors on AVX.
2150 | // |
2151 | // Float/Double Vectors: |
2152 | // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register |
2153 | // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or |
2154 | // smaller on AVX, then we don't need a tmpReg. |
2155 | // |
2156 | // 32-byte integer vector on AVX: we need two additional Xmm registers |
2157 | // different from targetReg as scratch. |
2158 | // |
2159 | // 16-byte integer vector on SSE4: we need one additional Xmm register |
2160 | // different from targetReg as scratch. |
2161 | if (varTypeIsFloating(baseType)) |
2162 | { |
2163 | if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32)) |
2164 | { |
2165 | tmpReg1 = simdNode->GetSingleTempReg(); |
2166 | assert(tmpReg1 != targetReg); |
2167 | } |
2168 | else |
2169 | { |
2170 | assert(simdNode->AvailableTempRegCount() == 0); |
2171 | } |
2172 | } |
2173 | else |
2174 | { |
2175 | assert(baseType == TYP_INT); |
2176 | assert(level >= SIMD_SSE4_Supported); |
2177 | |
2178 | if (level == SIMD_SSE4_Supported) |
2179 | { |
2180 | tmpReg1 = simdNode->GetSingleTempReg(); |
2181 | } |
2182 | else |
2183 | { |
2184 | tmpReg1 = simdNode->ExtractTempReg(); |
2185 | tmpReg2 = simdNode->GetSingleTempReg(); |
2186 | } |
2187 | } |
2188 | |
2189 | if (level == SIMD_SSE2_Supported) |
2190 | { |
2191 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg |
2192 | if (op1Reg == targetReg) |
2193 | { |
2194 | // Best case |
2195 | // nothing to do, we have registers in the right place |
2196 | } |
2197 | else if (op2Reg == targetReg) |
2198 | { |
2199 | op2Reg = op1Reg; |
2200 | } |
2201 | else |
2202 | { |
2203 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
2204 | } |
2205 | |
2206 | // DotProduct(v1, v2) |
2207 | // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1 |
2208 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
2209 | { |
2210 | assert(baseType == TYP_FLOAT); |
2211 | // v0 = v1 * v2 |
2212 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
2213 | // // position |
2214 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper |
2215 | // // bits |
2216 | // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1) |
2217 | // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2) |
2218 | // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2) |
2219 | // |
2220 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
2221 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2222 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY); |
2223 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2224 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW); |
2225 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2226 | } |
2227 | else if (baseType == TYP_FLOAT) |
2228 | { |
2229 | // v0 = v1 * v2 |
2230 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
2231 | // // position |
2232 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1) |
2233 | // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1) |
2234 | // tmp = v0 |
2235 | // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2) |
2236 | // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3) |
2237 | // // Essentially horizontal addition of all elements. |
2238 | // // We could achieve the same using SSEv3 instruction |
2239 | // // HADDPS. |
2240 | // |
2241 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
2242 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2243 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY); |
2244 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2245 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2246 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW); |
2247 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2248 | } |
2249 | else |
2250 | { |
2251 | assert(baseType == TYP_DOUBLE); |
2252 | |
2253 | // v0 = v1 * v2 |
2254 | // tmp = v0 // v0 = (1, 0) - each element is given by its position |
2255 | // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1) |
2256 | // v0 = v0 + tmp // v0 = (1+0, 0+1) |
2257 | inst_RV_RV(INS_mulpd, targetReg, op2Reg); |
2258 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2259 | inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01); |
2260 | inst_RV_RV(INS_addpd, targetReg, tmpReg1); |
2261 | } |
2262 | } |
2263 | else |
2264 | { |
2265 | assert(level >= SIMD_SSE4_Supported); |
2266 | |
2267 | if (varTypeIsFloating(baseType)) |
2268 | { |
2269 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg. |
2270 | // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually |
2271 | // use the 3-op form, so that we can avoid these copies. |
2272 | // TODO-CQ: Add inst_RV_RV_RV_IV(). |
2273 | if (op1Reg == targetReg) |
2274 | { |
2275 | // Best case |
2276 | // nothing to do, we have registers in the right place |
2277 | } |
2278 | else if (op2Reg == targetReg) |
2279 | { |
2280 | op2Reg = op1Reg; |
2281 | } |
2282 | else |
2283 | { |
2284 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
2285 | } |
2286 | |
2287 | emitAttr emitSize = emitActualTypeSize(simdEvalType); |
2288 | if (baseType == TYP_FLOAT) |
2289 | { |
2290 | // dpps computes the dot product of the upper & lower halves of the 32-byte register. |
2291 | // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. |
2292 | unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1; |
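                // In the dpps immediate, the high nibble selects the elements that enter the
                // multiply-add (0x7 = lower three for Vector3, 0xf = all four) and the low
                // nibble (0x1) broadcasts the sum to element 0 only.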
2293 | inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask); |
2296 | // If this is TYP_SIMD32, we need to combine the lower & upper results. |
2297 | if (simdEvalType == TYP_SIMD32) |
2298 | { |
2299 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
2300 | inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
2301 | } |
2302 | } |
2303 | else if (baseType == TYP_DOUBLE) |
2304 | { |
2305 | if (simdEvalType == TYP_SIMD32) |
2306 | { |
2307 | // targetReg = targetReg * op2Reg |
2308 | // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves |
2309 | // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg |
2310 | // targetReg = targetReg + tmpReg1 |
2311 | inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType)); |
2312 | inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType)); |
2313 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
2314 | inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
2315 | } |
2316 | else |
2317 | { |
2318 | // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use |
2319 | // dppd directly. |
2320 | assert(level == SIMD_SSE4_Supported); |
2321 | inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31); |
2322 | } |
2323 | } |
2324 | } |
2325 | else |
2326 | { |
            // Dot product of int vectors: 16-byte on SSE4, 32-byte on AVX.
2328 | assert(baseType == TYP_INT); |
2329 | assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32); |
2330 | |
2331 | #ifdef DEBUG |
2332 | // SSE4: We need 1 scratch register. |
2333 | // AVX2: We need 2 scratch registers. |
2334 | if (simdEvalType == TYP_SIMD16) |
2335 | { |
2336 | assert(tmpReg1 != REG_NA); |
2337 | } |
2338 | else |
2339 | { |
2340 | assert(tmpReg1 != REG_NA); |
2341 | assert(tmpReg2 != REG_NA); |
2342 | } |
2343 | #endif |
2344 | |
2345 | // tmpReg1 = op1 * op2 |
2346 | if (level == SIMD_AVX2_Supported) |
2347 | { |
                    // On AVX, take advantage of the 3-operand form of pmulld
2349 | inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType)); |
2350 | } |
2351 | else |
2352 | { |
2353 | inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType); |
2354 | inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType); |
2355 | } |
2356 | |
2357 | if (simdEvalType == TYP_SIMD32) |
2358 | { |
2359 | // tmpReg2[127..0] = Upper 128-bits of tmpReg1 |
2360 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01); |
2361 | |
2362 | // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0] |
2363 | // This will compute |
2364 | // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4] |
2365 | // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5] |
2366 | // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6] |
2367 | // tmpReg1[4] = op1[4]*op2[4] + op1[7]*op2[7] |
2368 | inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE); |
2369 | } |
2370 | |
2371 | // This horizontal add will compute |
2372 | // |
2373 | // TYP_SIMD16: |
2374 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1] |
2375 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[4]*op2[4] |
2376 | // |
2377 | // TYP_SIMD32: |
2378 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5] |
2379 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[4]*op2[4] + op1[7]*op2[7] |
2380 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
2381 | |
2382 | // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1] |
2383 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
2384 | |
2385 | // TargetReg = integer result from tmpReg1 |
2386 | // (Note that for mov_xmm2i, the int register is always in the reg2 position) |
2387 | inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT); |
2388 | } |
2389 | } |
2390 | |
2391 | genProduceReg(simdNode); |
2392 | } |
2393 | |
2394 | //------------------------------------------------------------------------------------ |
2395 | // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. |
2396 | // |
2397 | // Arguments: |
2398 | // simdNode - The GT_SIMD node |
2399 | // |
2400 | // Return Value: |
2401 | // None. |
2402 | // |
2403 | void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) |
2404 | { |
2405 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem); |
2406 | |
2407 | GenTree* op1 = simdNode->gtGetOp1(); |
2408 | GenTree* op2 = simdNode->gtGetOp2(); |
2409 | var_types simdType = op1->TypeGet(); |
2410 | assert(varTypeIsSIMD(simdType)); |
2411 | |
2412 | // op1 of TYP_SIMD12 should be considered as TYP_SIMD16, |
2413 | // since it is in XMM register. |
2414 | if (simdType == TYP_SIMD12) |
2415 | { |
2416 | simdType = TYP_SIMD16; |
2417 | } |
2418 | |
2419 | var_types baseType = simdNode->gtSIMDBaseType; |
2420 | regNumber targetReg = simdNode->gtRegNum; |
2421 | assert(targetReg != REG_NA); |
2422 | var_types targetType = simdNode->TypeGet(); |
2423 | assert(targetType == genActualType(baseType)); |
2424 | |
2425 | // GetItem has 2 operands: |
2426 | // - the source of SIMD type (op1) |
2427 | // - the index of the value to be returned. |
2428 | genConsumeOperands(simdNode); |
2429 | regNumber srcReg = op1->gtRegNum; |
2430 | |
    // Optimize the case where op1 is in memory: access the i'th element directly from memory.
2432 | if (!op1->isUsedFromReg()) |
2433 | { |
2434 | assert(op1->isContained()); |
2435 | |
2436 | regNumber baseReg; |
2437 | regNumber indexReg; |
2438 | int offset = 0; |
2439 | |
2440 | if (op1->OperIsLocal()) |
2441 | { |
2442 | // There are three parts to the total offset here: |
2443 | // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}. |
2444 | bool isEBPbased; |
2445 | unsigned varNum = op1->gtLclVarCommon.gtLclNum; |
2446 | offset += compiler->lvaFrameAddress(varNum, &isEBPbased); |
2447 | if (op1->OperGet() == GT_LCL_FLD) |
2448 | { |
2449 | offset += op1->gtLclFld.gtLclOffs; |
2450 | } |
2451 | baseReg = (isEBPbased) ? REG_EBP : REG_ESP; |
2452 | } |
2453 | else |
2454 | { |
2455 | // Require GT_IND addr to be not contained. |
2456 | assert(op1->OperGet() == GT_IND); |
2457 | |
2458 | GenTree* addr = op1->AsIndir()->Addr(); |
2459 | assert(!addr->isContained()); |
2460 | baseReg = addr->gtRegNum; |
2461 | } |
2462 | |
2463 | if (op2->isContainedIntOrIImmed()) |
2464 | { |
2465 | indexReg = REG_NA; |
2466 | offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType); |
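            // For example, a constant index of 2 into a Vector<float> adds 2 * 4 = 8 bytes.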
2467 | } |
2468 | else |
2469 | { |
2470 | indexReg = op2->gtRegNum; |
2471 | assert(genIsValidIntReg(indexReg)); |
2472 | } |
2473 | |
2474 | // Now, load the desired element. |
2475 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
2476 | emitTypeSize(baseType), // Of the vector baseType |
2477 | targetReg, // To targetReg |
2478 | baseReg, // Base Reg |
2479 | indexReg, // Indexed |
2480 | genTypeSize(baseType), // by the size of the baseType |
2481 | offset); |
2482 | genProduceReg(simdNode); |
2483 | return; |
2484 | } |
2485 | |
2486 | // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant. |
    // For the non-constant case, we will use the SIMD temp location to store the vector,
    // and then load the desired element.
2489 | // The range check will already have been performed, so at this point we know we have an index |
2490 | // within the bounds of the vector. |
2491 | if (!op2->IsCnsIntOrI()) |
2492 | { |
2493 | unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum; |
2494 | noway_assert(simdInitTempVarNum != BAD_VAR_NUM); |
2495 | bool isEBPbased; |
2496 | unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased); |
2497 | regNumber indexReg = op2->gtRegNum; |
2498 | |
2499 | // Store the vector to the temp location. |
2500 | getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)), |
2501 | emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0); |
2502 | |
2503 | // Now, load the desired element. |
2504 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
2505 | emitTypeSize(baseType), // Of the vector baseType |
2506 | targetReg, // To targetReg |
2507 | (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based |
2508 | indexReg, // Indexed |
2509 | genTypeSize(baseType), // by the size of the baseType |
2510 | offs); |
2511 | genProduceReg(simdNode); |
2512 | return; |
2513 | } |
2514 | |
2515 | noway_assert(op2->isContained()); |
2516 | noway_assert(op2->IsCnsIntOrI()); |
2517 | unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal; |
2518 | unsigned int byteShiftCnt = index * genTypeSize(baseType); |
2519 | |
2520 | // In general we shouldn't have an index greater than or equal to the length of the vector. |
2521 | // However, if we have an out-of-range access, under minOpts it will not be optimized |
2522 | // away. The code will throw before we reach this point, but we still need to generate |
2523 | // code. In that case, we will simply mask off the upper bits. |
2524 | if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength()) |
2525 | { |
2526 | byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1); |
2527 | index = byteShiftCnt / genTypeSize(baseType); |
2528 | } |
2529 | |
2530 | regNumber tmpReg = REG_NA; |
2531 | if (simdNode->AvailableTempRegCount() != 0) |
2532 | { |
2533 | tmpReg = simdNode->GetSingleTempReg(); |
2534 | } |
2535 | else |
2536 | { |
2537 | assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) || |
2538 | (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16))); |
2539 | } |
2540 | |
2541 | if (byteShiftCnt >= 16) |
2542 | { |
2543 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
2544 | byteShiftCnt -= 16; |
2545 | regNumber newSrcReg; |
2546 | if (varTypeIsFloating(baseType)) |
2547 | { |
2548 | newSrcReg = targetReg; |
2549 | } |
2550 | else |
2551 | { |
2552 | // Integer types |
2553 | assert(tmpReg != REG_NA); |
2554 | newSrcReg = tmpReg; |
2555 | } |
2556 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01); |
2557 | |
2558 | srcReg = newSrcReg; |
2559 | } |
2560 | |
2561 | // Generate the following sequence: |
2562 | // 1) baseType is floating point |
2563 | // movaps targetReg, srcReg |
2564 | // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element |
2565 | // |
2566 | // 2) baseType is not floating point |
2567 | // movaps tmpReg, srcReg <-- not generated if accessing zero'th element |
2568 | // OR if tmpReg == srcReg |
2569 | // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element |
2570 | // mov_xmm2i targetReg, tmpReg |
2571 | if (varTypeIsFloating(baseType)) |
2572 | { |
2573 | if (targetReg != srcReg) |
2574 | { |
2575 | inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType)); |
2576 | } |
2577 | |
2578 | if (byteShiftCnt != 0) |
2579 | { |
2580 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
2581 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt); |
2582 | } |
2583 | } |
2584 | else |
2585 | { |
2586 | if (varTypeIsSmallInt(baseType)) |
2587 | { |
            // Note that pextrw extracts a 16-bit value by index and zero extends it to 32 bits.
            // For Vector<short> we also need to sign extend the 16-bit value in targetReg.
            // For Vector<byte>, index/2 gives the index of the 16-bit value to extract; shift
            // right by 8 bits if the index is odd. For Vector<sbyte>, also sign extend targetReg.
2592 | |
2593 | unsigned baseSize = genTypeSize(baseType); |
2594 | if (baseSize == 1) |
2595 | { |
2596 | index /= 2; |
2597 | } |
2598 | // We actually want index % 8 for the AVX case (for SSE it will never be > 8). |
2599 | // Note that this doesn't matter functionally, because the instruction uses just the |
2600 | // low 3 bits of index, but it's better to use the right value. |
2601 | if (index > 8) |
2602 | { |
2603 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
2604 | index -= 8; |
2605 | } |
2606 | |
2607 | getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index); |
2608 | |
2609 | bool ZeroOrSignExtnReqd = true; |
2610 | if (baseSize == 1) |
2611 | { |
2612 | if ((op2->gtIntCon.gtIconVal % 2) == 1) |
2613 | { |
                    // When extracting a byte-sized element at an odd index, right shift the
                    // extracted word by 8 bits.
2615 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8); |
2616 | |
2617 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE |
2618 | ZeroOrSignExtnReqd = (baseType == TYP_BYTE); |
2619 | } |
2620 | // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits |
2621 | } |
2622 | else |
2623 | { |
2624 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT |
2625 | assert(baseSize == 2); |
2626 | ZeroOrSignExtnReqd = (baseType == TYP_SHORT); |
2627 | } |
2628 | |
2629 | if (ZeroOrSignExtnReqd) |
2630 | { |
2631 | // Zero/sign extend the byte/short to 32-bits |
2632 | inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType)); |
2633 | } |
2634 | } |
2635 | else |
2636 | { |
            // We need a temp xmm register if the baseType is not floating point and
            // we are accessing a non-zero'th element.
2639 | instruction ins; |
2640 | |
2641 | if (byteShiftCnt != 0) |
2642 | { |
2643 | assert(tmpReg != REG_NA); |
2644 | |
2645 | if (tmpReg != srcReg) |
2646 | { |
2647 | inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType)); |
2648 | } |
2649 | |
2650 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
2651 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt); |
2652 | } |
2653 | else |
2654 | { |
2655 | tmpReg = srcReg; |
2656 | } |
2657 | |
2658 | assert(tmpReg != REG_NA); |
2659 | ins = ins_CopyFloatToInt(TYP_FLOAT, baseType); |
2660 | // (Note that for mov_xmm2i, the int register is always in the reg2 position.) |
2661 | inst_RV_RV(ins, tmpReg, targetReg, baseType); |
2662 | } |
2663 | } |
2664 | |
2665 | genProduceReg(simdNode); |
2666 | } |
2667 | |
2668 | //------------------------------------------------------------------------------------ |
2669 | // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i. |
2670 | // |
2671 | // Arguments: |
2672 | // simdNode - The GT_SIMD node |
2673 | // |
2674 | // Return Value: |
2675 | // None. |
2676 | // |
2677 | // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case. |
2678 | // |
2679 | void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode) |
2680 | { |
2681 | // Determine index based on intrinsic ID |
2682 | int index = -1; |
2683 | switch (simdNode->gtSIMDIntrinsicID) |
2684 | { |
2685 | case SIMDIntrinsicSetX: |
2686 | index = 0; |
2687 | break; |
2688 | case SIMDIntrinsicSetY: |
2689 | index = 1; |
2690 | break; |
2691 | case SIMDIntrinsicSetZ: |
2692 | index = 2; |
2693 | break; |
2694 | case SIMDIntrinsicSetW: |
2695 | index = 3; |
2696 | break; |
2697 | |
2698 | default: |
2699 | unreached(); |
2700 | } |
2701 | assert(index != -1); |
2702 | |
2703 | // op1 is the SIMD vector |
2704 | // op2 is the value to be set |
2705 | GenTree* op1 = simdNode->gtGetOp1(); |
2706 | GenTree* op2 = simdNode->gtGetOp2(); |
2707 | |
2708 | var_types baseType = simdNode->gtSIMDBaseType; |
2709 | regNumber targetReg = simdNode->gtRegNum; |
2710 | assert(targetReg != REG_NA); |
2711 | var_types targetType = simdNode->TypeGet(); |
2712 | assert(varTypeIsSIMD(targetType)); |
2713 | |
2714 | // the following assert must hold. |
2715 | // supported only on vector2f/3f/4f right now |
2716 | noway_assert(baseType == TYP_FLOAT); |
2717 | assert(op2->TypeGet() == baseType); |
2718 | assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType))); |
2719 | |
2720 | genConsumeOperands(simdNode); |
2721 | regNumber op1Reg = op1->gtRegNum; |
2722 | regNumber op2Reg = op2->gtRegNum; |
2723 | |
2724 | // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate. |
2725 | if (targetReg != op1Reg) |
2726 | { |
2727 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2728 | } |
2729 | |
2730 | // Right now this intrinsic is supported only for float base type vectors. |
    // If we need to support other base type vectors in the future, the logic below
    // will need modification.
2733 | noway_assert(baseType == TYP_FLOAT); |
2734 | |
2735 | if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) |
2736 | { |
2737 | // We need one additional int register as scratch |
2738 | regNumber tmpReg = simdNode->GetSingleTempReg(); |
2739 | assert(genIsValidIntReg(tmpReg)); |
2740 | |
2741 | // Move the value from xmm reg to an int reg |
2742 | instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT); |
        // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
2744 | inst_RV_RV(ins, op2Reg, tmpReg, baseType); |
2745 | |
2746 | // First insert the lower 16-bits of tmpReg in targetReg at 2*index position |
2747 | // since every float has two 16-bit words. |
2748 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index); |
2749 | |
2750 | // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position |
2751 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16); |
2752 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1); |
2753 | } |
2754 | else |
2755 | { |
2756 | unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index)); |
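        // For example, SetZ (index == 2) yields insertpsImm == (0 << 6) | (2 << 4) == 0x20:
        // insert element 0 of op2Reg at element 2 of targetReg, zeroing nothing.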
2757 | inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm); |
2758 | } |
2759 | |
2760 | genProduceReg(simdNode); |
2761 | } |
2762 | |
2763 | //------------------------------------------------------------------------ |
2764 | // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle. |
2765 | // |
2766 | // Arguments: |
2767 | // simdNode - The GT_SIMD node |
2768 | // |
2769 | // Return Value: |
2770 | // None. |
2771 | // |
2772 | void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode) |
2773 | { |
2774 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2); |
2775 | noway_assert(compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported); |
2776 | |
2777 | GenTree* op1 = simdNode->gtGetOp1(); |
2778 | GenTree* op2 = simdNode->gtGetOp2(); |
2779 | assert(op2->isContained()); |
2780 | assert(op2->IsCnsIntOrI()); |
2781 | int shuffleControl = (int)op2->AsIntConCommon()->IconValue(); |
2782 | var_types baseType = simdNode->gtSIMDBaseType; |
2783 | var_types targetType = simdNode->TypeGet(); |
2784 | regNumber targetReg = simdNode->gtRegNum; |
2785 | assert(targetReg != REG_NA); |
2786 | |
2787 | regNumber op1Reg = genConsumeReg(op1); |
2788 | if (targetReg != op1Reg) |
2789 | { |
2790 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2791 | } |
2792 | |
2793 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
2794 | getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl); |
2795 | genProduceReg(simdNode); |
2796 | } |
2797 | |
2798 | //----------------------------------------------------------------------------- |
2799 | // genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory. |
2800 | // Since Vector3 is not a hardware supported write size, it is performed |
2801 | // as two writes: 8 byte followed by 4-byte. |
2802 | // |
2803 | // Arguments: |
2804 | // treeNode - tree node that is attempting to store indirect |
2805 | // |
2806 | // |
2807 | // Return Value: |
2808 | // None. |
2809 | // |
2810 | void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode) |
2811 | { |
2812 | assert(treeNode->OperGet() == GT_STOREIND); |
2813 | |
2814 | GenTree* addr = treeNode->gtOp.gtOp1; |
2815 | GenTree* data = treeNode->gtOp.gtOp2; |
2816 | |
2817 | // addr and data should not be contained. |
2818 | assert(!data->isContained()); |
2819 | assert(!addr->isContained()); |
2820 | |
2821 | #ifdef DEBUG |
2822 | // Should not require a write barrier |
2823 | GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data); |
2824 | assert(writeBarrierForm == GCInfo::WBF_NoBarrier); |
2825 | #endif |
2826 | |
    // Need an additional Xmm register to extract upper 4 bytes from data.
2828 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2829 | |
2830 | genConsumeOperands(treeNode->AsOp()); |
2831 | |
2832 | // 8-byte write |
2833 | getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0); |
2834 | |
2835 | // Extract upper 4-bytes from data |
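    // (pshufd with immediate 0x02 selects source element 2 into element 0 of tmpReg;
    // the remaining elements are don't-care for this 4-byte store.)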
2836 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02); |
2837 | |
2838 | // 4-byte write |
2839 | getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8); |
2840 | } |
2841 | |
2842 | //----------------------------------------------------------------------------- |
2843 | // genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value. |
2844 | // Since Vector3 is not a hardware supported write size, it is performed |
2845 | // as two loads: 8 byte followed by 4-byte. |
2846 | // |
2847 | // Arguments: |
2848 | // treeNode - tree node of GT_IND |
2849 | // |
2850 | // |
2851 | // Return Value: |
2852 | // None. |
2853 | // |
2854 | void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode) |
2855 | { |
2856 | assert(treeNode->OperGet() == GT_IND); |
2857 | |
2858 | regNumber targetReg = treeNode->gtRegNum; |
2859 | GenTree* op1 = treeNode->gtOp.gtOp1; |
2860 | assert(!op1->isContained()); |
2861 | regNumber operandReg = genConsumeReg(op1); |
2862 | |
    // Need an additional Xmm register, different from targetReg, to read the upper 4 bytes.
2864 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2865 | assert(tmpReg != targetReg); |
2866 | |
2867 | // Load upper 4 bytes in tmpReg |
2868 | getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8); |
2869 | |
2870 | // Load lower 8 bytes in targetReg |
2871 | getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0); |
2872 | |
2873 | // combine upper 4 bytes and lower 8 bytes in targetReg |
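    // (Per the SHUFFLE_* naming convention, SHUFFLE_YXYX keeps x and y in elements 0-1 of
    // targetReg and fills elements 2-3 from elements 0-1 of tmpReg: z, plus the zero that
    // the 4-byte load left in element 1.)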
2874 | getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX); |
2875 | |
2876 | genProduceReg(treeNode); |
2877 | } |
2878 | |
2879 | //----------------------------------------------------------------------------- |
2880 | // genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field. |
2881 | // Since Vector3 is not a hardware supported write size, it is performed |
2882 | // as two stores: 8 byte followed by 4-byte. |
2883 | // |
2884 | // Arguments: |
2885 | // treeNode - tree node that is attempting to store TYP_SIMD12 field |
2886 | // |
2887 | // Return Value: |
2888 | // None. |
2889 | // |
2890 | void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode) |
2891 | { |
2892 | assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR)); |
2893 | |
2894 | unsigned offs = 0; |
2895 | unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; |
2896 | assert(varNum < compiler->lvaCount); |
2897 | |
    if (treeNode->OperGet() == GT_STORE_LCL_FLD)
2899 | { |
2900 | offs = treeNode->gtLclFld.gtLclOffs; |
2901 | } |
2902 | |
2903 | GenTree* op1 = treeNode->gtOp.gtOp1; |
2904 | assert(!op1->isContained()); |
2905 | regNumber operandReg = genConsumeReg(op1); |
2906 | |
    // Need an additional Xmm register to extract the upper 4 bytes from the data.
2908 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2909 | |
2910 | // store lower 8 bytes |
2911 | getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs); |
2912 | |
    // Extract the upper 4 bytes from operandReg
2914 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02); |
2915 | |
2916 | // Store upper 4 bytes |
2917 | getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs + 8); |
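
    // The local's stack slot now holds x and y at [offs + 0..7] and z at [offs + 8..11];
    // the fourth element of the in-register value is not written.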
2918 | } |
2919 | |
2920 | //----------------------------------------------------------------------------- |
2921 | // genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field. |
2922 | // Since Vector3 is not a hardware supported read size, it is performed |
// as two reads: a 4-byte read followed by an 8-byte read.
2924 | // |
2925 | // Arguments: |
2926 | // treeNode - tree node that is attempting to load TYP_SIMD12 field |
2927 | // |
2928 | // Return Value: |
2929 | // None. |
2930 | // |
2931 | void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode) |
2932 | { |
2933 | assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR)); |
2934 | |
2935 | regNumber targetReg = treeNode->gtRegNum; |
2936 | unsigned offs = 0; |
2937 | unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; |
2938 | assert(varNum < compiler->lvaCount); |
2939 | |
2940 | if (treeNode->OperGet() == GT_LCL_FLD) |
2941 | { |
2942 | offs = treeNode->gtLclFld.gtLclOffs; |
2943 | } |
2944 | |
2945 | // Need an additional Xmm register that is different from targetReg to read upper 4 bytes. |
2946 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2947 | assert(tmpReg != targetReg); |
2948 | |
2949 | // Read upper 4 bytes to tmpReg |
2950 | getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs + 8); |
2951 | |
2952 | // Read lower 8 bytes to targetReg |
2953 | getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs); |
2954 | |
2955 | // combine upper 4 bytes and lower 8 bytes in targetReg |
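    // As in genLoadIndTypeSIMD12: SHUFFLE_YXYX keeps x and y in the low two elements
    // of targetReg, moves tmpReg's element 0 (z) into element 2, and leaves zero in
    // element 3 (the 4-byte load above zeroed the upper lanes of tmpReg).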
2956 | getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX); |
2957 | |
2958 | genProduceReg(treeNode); |
2959 | } |
2960 | |
2961 | #ifdef _TARGET_X86_ |
2962 | |
2963 | //----------------------------------------------------------------------------- |
// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) value to the stack.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: an 8-byte store followed by a 4-byte store. The stack is
// assumed to have already been adjusted.
2968 | // |
2969 | // Arguments: |
2970 | // operandReg - the xmm register containing the SIMD12 to store. |
2971 | // tmpReg - an xmm register that can be used as a temporary for the operation. |
2972 | // |
2973 | // Return Value: |
2974 | // None. |
2975 | // |
2976 | void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg) |
2977 | { |
2978 | assert(genIsValidFloatReg(operandReg)); |
2979 | assert(genIsValidFloatReg(tmpReg)); |
2980 | |
2981 | // 8-byte write |
2982 | getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0); |
2983 | |
    // Extract the upper 4 bytes from operandReg
2985 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02); |
2986 | |
2987 | // 4-byte write |
2988 | getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8); |
2989 | } |
2990 | |
2991 | //----------------------------------------------------------------------------- |
// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) argument to the stack.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: an 8-byte store followed by a 4-byte store. The stack is
// assumed to have already been adjusted.
2996 | // |
2997 | // Arguments: |
2998 | // treeNode - tree node that is attempting to store TYP_SIMD12 field |
2999 | // |
3000 | // Return Value: |
3001 | // None. |
3002 | // |
3003 | void CodeGen::genPutArgStkSIMD12(GenTree* treeNode) |
3004 | { |
3005 | assert(treeNode->OperGet() == GT_PUTARG_STK); |
3006 | |
3007 | GenTree* op1 = treeNode->gtOp.gtOp1; |
3008 | assert(!op1->isContained()); |
3009 | regNumber operandReg = genConsumeReg(op1); |
3010 | |
    // Need an additional Xmm register to extract the upper 4 bytes from the data.
3012 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
3013 | |
3014 | genStoreSIMD12ToStack(operandReg, tmpReg); |
3015 | } |
3016 | |
3017 | #endif // _TARGET_X86_ |
3018 | |
3019 | //----------------------------------------------------------------------------- |
3020 | // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to |
3021 | // the given register, if any, or to memory. |
3022 | // |
3023 | // Arguments: |
3024 | // simdNode - The GT_SIMD node |
3025 | // |
3026 | // Return Value: |
3027 | // None. |
3028 | // |
3029 | // Notes: |
3030 | // The upper half of all AVX registers is volatile, even the callee-save registers. |
3031 | // When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic |
3032 | // to cause the upper half to be saved. It will first attempt to find another, unused, callee-save |
3033 | // register. If such a register cannot be found, it will save it to an available caller-save register. |
3034 | // In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte |
3035 | // value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte |
3036 | // value will be spilled to the stack.) |
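//
// For example, if a 32-byte value is live in ymm7 across a call and xmm8 happens to be
// free (register numbers here are illustrative), the save and its matching restore
// reduce to:
//   vextractf128 xmm8, ymm7, 1
//   ...call...
//   vinsertf128  ymm7, ymm7, xmm8, 1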
3037 | // |
3038 | void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode) |
3039 | { |
3040 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave); |
3041 | |
3042 | GenTree* op1 = simdNode->gtGetOp1(); |
3043 | assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); |
3044 | regNumber targetReg = simdNode->gtRegNum; |
3045 | regNumber op1Reg = genConsumeReg(op1); |
3046 | assert(op1Reg != REG_NA); |
3047 | assert(targetReg != REG_NA); |
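    // vextractf128 with an imm8 of 1 copies the upper 128 bits of op1Reg (a ymm register)
    // into targetReg. Only the upper half needs saving: the lower 128 bits of a
    // callee-save register survive the call, per the Notes above.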
3048 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01); |
3049 | |
3050 | genProduceReg(simdNode); |
3051 | } |
3052 | |
3053 | //----------------------------------------------------------------------------- |
// genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector from
// the register in which it was saved, or from memory if it was subsequently spilled.
3056 | // |
3057 | // Arguments: |
3058 | // simdNode - The GT_SIMD node |
3059 | // |
3060 | // Return Value: |
3061 | // None. |
3062 | // |
3063 | // Notes: |
3064 | // For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always |
3065 | // have their home register, this node has its targetReg on the lclVar child, and its source |
3066 | // on the simdNode. |
3067 | // Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled |
3068 | // an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike |
3069 | // most spill scenarios, the saved tree will be different from the restored tree, but the spill |
3070 | // restore logic, which is triggered by the call to genConsumeReg, requires us to provide the |
3071 | // spilled tree (saveNode) in order to perform the reload. We can easily find that tree, |
3072 | // as it is in the spill descriptor for the register from which it was saved. |
3073 | // |
3074 | void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode) |
3075 | { |
3076 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore); |
3077 | |
3078 | GenTree* op1 = simdNode->gtGetOp1(); |
3079 | assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); |
3080 | regNumber srcReg = simdNode->gtRegNum; |
3081 | regNumber lclVarReg = genConsumeReg(op1); |
3082 | unsigned varNum = op1->AsLclVarCommon()->gtLclNum; |
3083 | assert(lclVarReg != REG_NA); |
3084 | assert(srcReg != REG_NA); |
3085 | if (simdNode->gtFlags & GTF_SPILLED) |
3086 | { |
3087 | GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree; |
3088 | noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg)); |
3089 | genConsumeReg(saveNode); |
3090 | } |
3091 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01); |
3092 | } |
3093 | |
3094 | //------------------------------------------------------------------------ |
3095 | // genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main |
3096 | // routine which in turn calls appropriate genSIMDIntrinsicXXX() routine. |
3097 | // |
3098 | // Arguments: |
3099 | // simdNode - The GT_SIMD node |
3100 | // |
3101 | // Return Value: |
3102 | // None. |
3103 | // |
3104 | // Notes: |
3105 | // Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and |
3106 | // a limited set of methods. |
3107 | // |
3108 | void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) |
3109 | { |
3110 | // NYI for unsupported base types |
3111 | if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG && |
3112 | simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE && |
3113 | simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE && |
3114 | simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE && |
3115 | simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG) |
3116 | { |
3117 | noway_assert(!"SIMD intrinsic with unsupported base type." ); |
3118 | } |
3119 | |
3120 | switch (simdNode->gtSIMDIntrinsicID) |
3121 | { |
3122 | case SIMDIntrinsicInit: |
3123 | genSIMDIntrinsicInit(simdNode); |
3124 | break; |
3125 | |
3126 | case SIMDIntrinsicInitN: |
3127 | genSIMDIntrinsicInitN(simdNode); |
3128 | break; |
3129 | |
3130 | case SIMDIntrinsicSqrt: |
3131 | case SIMDIntrinsicCast: |
3132 | case SIMDIntrinsicAbs: |
3133 | genSIMDIntrinsicUnOp(simdNode); |
3134 | break; |
3135 | |
3136 | case SIMDIntrinsicConvertToSingle: |
3137 | case SIMDIntrinsicConvertToInt32: |
3138 | genSIMDIntrinsic32BitConvert(simdNode); |
3139 | break; |
3140 | |
3141 | case SIMDIntrinsicConvertToDouble: |
3142 | case SIMDIntrinsicConvertToInt64: |
3143 | genSIMDIntrinsic64BitConvert(simdNode); |
3144 | break; |
3145 | |
3146 | case SIMDIntrinsicWidenLo: |
3147 | case SIMDIntrinsicWidenHi: |
3148 | genSIMDIntrinsicWiden(simdNode); |
3149 | break; |
3150 | |
3151 | case SIMDIntrinsicNarrow: |
3152 | genSIMDIntrinsicNarrow(simdNode); |
3153 | break; |
3154 | |
3155 | case SIMDIntrinsicAdd: |
3156 | case SIMDIntrinsicSub: |
3157 | case SIMDIntrinsicMul: |
3158 | case SIMDIntrinsicDiv: |
3159 | case SIMDIntrinsicBitwiseAnd: |
3160 | case SIMDIntrinsicBitwiseAndNot: |
3161 | case SIMDIntrinsicBitwiseOr: |
3162 | case SIMDIntrinsicBitwiseXor: |
3163 | case SIMDIntrinsicMin: |
3164 | case SIMDIntrinsicMax: |
3165 | genSIMDIntrinsicBinOp(simdNode); |
3166 | break; |
3167 | |
3168 | case SIMDIntrinsicOpEquality: |
3169 | case SIMDIntrinsicOpInEquality: |
3170 | case SIMDIntrinsicEqual: |
3171 | case SIMDIntrinsicLessThan: |
3172 | case SIMDIntrinsicGreaterThan: |
3173 | case SIMDIntrinsicLessThanOrEqual: |
3174 | case SIMDIntrinsicGreaterThanOrEqual: |
3175 | genSIMDIntrinsicRelOp(simdNode); |
3176 | break; |
3177 | |
3178 | case SIMDIntrinsicDotProduct: |
3179 | genSIMDIntrinsicDotProduct(simdNode); |
3180 | break; |
3181 | |
3182 | case SIMDIntrinsicGetItem: |
3183 | genSIMDIntrinsicGetItem(simdNode); |
3184 | break; |
3185 | |
3186 | case SIMDIntrinsicShuffleSSE2: |
3187 | genSIMDIntrinsicShuffleSSE2(simdNode); |
3188 | break; |
3189 | |
3190 | case SIMDIntrinsicSetX: |
3191 | case SIMDIntrinsicSetY: |
3192 | case SIMDIntrinsicSetZ: |
3193 | case SIMDIntrinsicSetW: |
3194 | genSIMDIntrinsicSetItem(simdNode); |
3195 | break; |
3196 | |
3197 | case SIMDIntrinsicUpperSave: |
3198 | genSIMDIntrinsicUpperSave(simdNode); |
3199 | break; |
3200 | case SIMDIntrinsicUpperRestore: |
3201 | genSIMDIntrinsicUpperRestore(simdNode); |
3202 | break; |
3203 | |
3204 | default: |
3205 | noway_assert(!"Unimplemented SIMD intrinsic." ); |
3206 | unreached(); |
3207 | } |
3208 | } |
3209 | |
3210 | #endif // FEATURE_SIMD |
3211 | #endif //_TARGET_XARCH_ |
3212 | |