1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
7 | XX XX |
8 | XX Amd64 SIMD Code Generator XX |
9 | XX XX |
10 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
11 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX |
12 | */ |
13 | #include "jitpch.h" |
14 | #ifdef _MSC_VER |
15 | #pragma hdrstop |
16 | #endif |
17 | |
18 | #ifdef _TARGET_XARCH_ |
19 | #ifdef FEATURE_SIMD |
20 | |
21 | #include "emit.h" |
22 | #include "codegen.h" |
23 | #include "sideeffects.h" |
24 | #include "lower.h" |
25 | #include "gcinfo.h" |
26 | #include "gcinfoencoder.h" |
27 | |
28 | // Instruction immediates |
29 | |
30 | // Insertps: |
31 | // - bits 6 and 7 of the immediate indicate which source item to select (0..3) |
32 | // - bits 4 and 5 of the immediate indicate which target item to insert into (0..3) |
33 | // - bits 0 to 3 of the immediate indicate which target item to zero |
#define INSERTPS_SOURCE_SELECT(i) ((i) << 6)
#define INSERTPS_TARGET_SELECT(i) ((i) << 4)
#define INSERTPS_ZERO(i) (1 << (i))
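
// For example, (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3))
// copies source element 0 into target element 0 and zeroes target elements 1 through 3, as done in
// genSIMDScalarMove below (the source select defaults to element 0 when omitted).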
37 | |
//------------------------------------------------------------------------
// getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
//
// Arguments:
//    intrinsicId - SIMD intrinsic Id
//    baseType    - Base type of the SIMD vector
//    ival        - Out param. Any immediate byte operand that needs to be passed with the instruction
//
// Return Value:
//    Instruction (op) to be used, and *ival is set if the instruction requires an immediate operand.
//
49 | instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/) |
50 | { |
51 | // Minimal required instruction set is SSE2. |
52 | assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); |
53 | |
54 | instruction result = INS_invalid; |
55 | switch (intrinsicId) |
56 | { |
57 | case SIMDIntrinsicInit: |
58 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
59 | { |
60 | // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory. |
61 | // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg. |
62 | // If we decide to use AVX2 only, we can remove this assert. |
63 | if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2)) |
64 | { |
65 | assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE); |
66 | } |
67 | switch (baseType) |
68 | { |
69 | case TYP_FLOAT: |
70 | result = INS_vbroadcastss; |
71 | break; |
72 | case TYP_DOUBLE: |
73 | result = INS_vbroadcastsd; |
74 | break; |
75 | case TYP_ULONG: |
76 | case TYP_LONG: |
77 | // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed |
78 | // to be TYP_LONG reg. |
79 | result = INS_vpbroadcastq; |
80 | break; |
81 | case TYP_UINT: |
82 | case TYP_INT: |
83 | result = INS_vpbroadcastd; |
84 | break; |
85 | case TYP_USHORT: |
86 | case TYP_SHORT: |
87 | result = INS_vpbroadcastw; |
88 | break; |
89 | case TYP_UBYTE: |
90 | case TYP_BYTE: |
91 | result = INS_vpbroadcastb; |
92 | break; |
93 | default: |
94 | unreached(); |
95 | } |
96 | break; |
97 | } |
98 | |
99 | // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic. |
100 | __fallthrough; |
101 | |
102 | case SIMDIntrinsicShuffleSSE2: |
103 | if (baseType == TYP_FLOAT) |
104 | { |
105 | result = INS_shufps; |
106 | } |
107 | else if (baseType == TYP_DOUBLE) |
108 | { |
109 | result = INS_shufpd; |
110 | } |
111 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
112 | { |
113 | result = INS_pshufd; |
114 | } |
115 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
116 | { |
117 | // We don't have a separate SSE2 instruction and will |
118 | // use the instruction meant for doubles since it is |
119 | // of the same size as a long. |
120 | result = INS_shufpd; |
121 | } |
122 | break; |
123 | |
124 | case SIMDIntrinsicSqrt: |
125 | if (baseType == TYP_FLOAT) |
126 | { |
127 | result = INS_sqrtps; |
128 | } |
129 | else if (baseType == TYP_DOUBLE) |
130 | { |
131 | result = INS_sqrtpd; |
132 | } |
133 | else |
134 | { |
135 | unreached(); |
136 | } |
137 | break; |
138 | |
139 | case SIMDIntrinsicAdd: |
140 | if (baseType == TYP_FLOAT) |
141 | { |
142 | result = INS_addps; |
143 | } |
144 | else if (baseType == TYP_DOUBLE) |
145 | { |
146 | result = INS_addpd; |
147 | } |
148 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
149 | { |
150 | result = INS_paddd; |
151 | } |
152 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
153 | { |
154 | result = INS_paddw; |
155 | } |
156 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
157 | { |
158 | result = INS_paddb; |
159 | } |
160 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
161 | { |
162 | result = INS_paddq; |
163 | } |
164 | break; |
165 | |
166 | case SIMDIntrinsicSub: |
167 | if (baseType == TYP_FLOAT) |
168 | { |
169 | result = INS_subps; |
170 | } |
171 | else if (baseType == TYP_DOUBLE) |
172 | { |
173 | result = INS_subpd; |
174 | } |
175 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
176 | { |
177 | result = INS_psubd; |
178 | } |
179 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
180 | { |
181 | result = INS_psubw; |
182 | } |
183 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
184 | { |
185 | result = INS_psubb; |
186 | } |
187 | else if (baseType == TYP_LONG || baseType == TYP_ULONG) |
188 | { |
189 | result = INS_psubq; |
190 | } |
191 | break; |
192 | |
193 | case SIMDIntrinsicMul: |
194 | if (baseType == TYP_FLOAT) |
195 | { |
196 | result = INS_mulps; |
197 | } |
198 | else if (baseType == TYP_DOUBLE) |
199 | { |
200 | result = INS_mulpd; |
201 | } |
202 | else if (baseType == TYP_SHORT) |
203 | { |
204 | result = INS_pmullw; |
205 | } |
206 | else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
207 | { |
208 | result = INS_pmulld; |
209 | } |
210 | break; |
211 | |
212 | case SIMDIntrinsicDiv: |
213 | if (baseType == TYP_FLOAT) |
214 | { |
215 | result = INS_divps; |
216 | } |
217 | else if (baseType == TYP_DOUBLE) |
218 | { |
219 | result = INS_divpd; |
220 | } |
221 | else |
222 | { |
223 | unreached(); |
224 | } |
225 | break; |
226 | |
227 | case SIMDIntrinsicMin: |
228 | if (baseType == TYP_FLOAT) |
229 | { |
230 | result = INS_minps; |
231 | } |
232 | else if (baseType == TYP_DOUBLE) |
233 | { |
234 | result = INS_minpd; |
235 | } |
236 | else if (baseType == TYP_UBYTE) |
237 | { |
238 | result = INS_pminub; |
239 | } |
240 | else if (baseType == TYP_SHORT) |
241 | { |
242 | result = INS_pminsw; |
243 | } |
244 | else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
245 | { |
246 | if (baseType == TYP_BYTE) |
247 | { |
248 | result = INS_pminsb; |
249 | } |
250 | else if (baseType == TYP_USHORT) |
251 | { |
252 | result = INS_pminuw; |
253 | } |
254 | else if (baseType == TYP_INT) |
255 | { |
256 | result = INS_pminsd; |
257 | } |
258 | else if (baseType == TYP_UINT) |
259 | { |
260 | result = INS_pminud; |
261 | } |
262 | } |
263 | else |
264 | { |
265 | unreached(); |
266 | } |
267 | break; |
268 | |
269 | case SIMDIntrinsicMax: |
270 | if (baseType == TYP_FLOAT) |
271 | { |
272 | result = INS_maxps; |
273 | } |
274 | else if (baseType == TYP_DOUBLE) |
275 | { |
276 | result = INS_maxpd; |
277 | } |
278 | else if (baseType == TYP_UBYTE) |
279 | { |
280 | result = INS_pmaxub; |
281 | } |
282 | else if (baseType == TYP_SHORT) |
283 | { |
284 | result = INS_pmaxsw; |
285 | } |
286 | else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
287 | { |
288 | if (baseType == TYP_BYTE) |
289 | { |
290 | result = INS_pmaxsb; |
291 | } |
292 | else if (baseType == TYP_USHORT) |
293 | { |
294 | result = INS_pmaxuw; |
295 | } |
296 | else if (baseType == TYP_INT) |
297 | { |
298 | result = INS_pmaxsd; |
299 | } |
300 | else if (baseType == TYP_UINT) |
301 | { |
302 | result = INS_pmaxud; |
303 | } |
304 | } |
305 | else |
306 | { |
307 | unreached(); |
308 | } |
309 | break; |
310 | |
311 | case SIMDIntrinsicAbs: |
312 | if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
313 | { |
314 | if (baseType == TYP_INT) |
315 | { |
316 | result = INS_pabsd; |
317 | } |
318 | else if (baseType == TYP_SHORT) |
319 | { |
320 | result = INS_pabsw; |
321 | } |
322 | else if (baseType == TYP_BYTE) |
323 | { |
324 | result = INS_pabsb; |
325 | } |
326 | } |
327 | break; |
328 | |
329 | case SIMDIntrinsicEqual: |
330 | if (baseType == TYP_FLOAT) |
331 | { |
332 | result = INS_cmpps; |
333 | assert(ival != nullptr); |
334 | *ival = 0; |
335 | } |
336 | else if (baseType == TYP_DOUBLE) |
337 | { |
338 | result = INS_cmppd; |
339 | assert(ival != nullptr); |
340 | *ival = 0; |
341 | } |
342 | else if (baseType == TYP_INT || baseType == TYP_UINT) |
343 | { |
344 | result = INS_pcmpeqd; |
345 | } |
346 | else if (baseType == TYP_USHORT || baseType == TYP_SHORT) |
347 | { |
348 | result = INS_pcmpeqw; |
349 | } |
350 | else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) |
351 | { |
352 | result = INS_pcmpeqb; |
353 | } |
354 | else if ((baseType == TYP_ULONG || baseType == TYP_LONG) && |
355 | (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
356 | { |
357 | result = INS_pcmpeqq; |
358 | } |
359 | break; |
360 | |
361 | case SIMDIntrinsicLessThan: |
362 | // Packed integers use > with swapped operands |
363 | assert(baseType != TYP_INT); |
364 | |
365 | if (baseType == TYP_FLOAT) |
366 | { |
367 | result = INS_cmpps; |
368 | assert(ival != nullptr); |
369 | *ival = 1; |
370 | } |
371 | else if (baseType == TYP_DOUBLE) |
372 | { |
373 | result = INS_cmppd; |
374 | assert(ival != nullptr); |
375 | *ival = 1; |
376 | } |
377 | break; |
378 | |
379 | case SIMDIntrinsicLessThanOrEqual: |
            // Packed integers use (a == b) || (b > a) in place of a <= b.
381 | assert(baseType != TYP_INT); |
382 | |
383 | if (baseType == TYP_FLOAT) |
384 | { |
385 | result = INS_cmpps; |
386 | assert(ival != nullptr); |
387 | *ival = 2; |
388 | } |
389 | else if (baseType == TYP_DOUBLE) |
390 | { |
391 | result = INS_cmppd; |
392 | assert(ival != nullptr); |
393 | *ival = 2; |
394 | } |
395 | break; |
396 | |
397 | case SIMDIntrinsicGreaterThan: |
398 | // Packed float/double use < with swapped operands |
399 | assert(!varTypeIsFloating(baseType)); |
400 | |
401 | // SSE2 supports only signed > |
402 | if (baseType == TYP_INT) |
403 | { |
404 | result = INS_pcmpgtd; |
405 | } |
406 | else if (baseType == TYP_SHORT) |
407 | { |
408 | result = INS_pcmpgtw; |
409 | } |
410 | else if (baseType == TYP_BYTE) |
411 | { |
412 | result = INS_pcmpgtb; |
413 | } |
414 | else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) |
415 | { |
416 | result = INS_pcmpgtq; |
417 | } |
418 | break; |
419 | |
420 | case SIMDIntrinsicBitwiseAnd: |
421 | if (baseType == TYP_FLOAT) |
422 | { |
423 | result = INS_andps; |
424 | } |
425 | else if (baseType == TYP_DOUBLE) |
426 | { |
427 | result = INS_andpd; |
428 | } |
429 | else if (varTypeIsIntegral(baseType)) |
430 | { |
431 | result = INS_pand; |
432 | } |
433 | break; |
434 | |
435 | case SIMDIntrinsicBitwiseAndNot: |
436 | if (baseType == TYP_FLOAT) |
437 | { |
438 | result = INS_andnps; |
439 | } |
440 | else if (baseType == TYP_DOUBLE) |
441 | { |
442 | result = INS_andnpd; |
443 | } |
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_pandn;
            }
452 | break; |
453 | |
454 | case SIMDIntrinsicBitwiseOr: |
455 | if (baseType == TYP_FLOAT) |
456 | { |
457 | result = INS_orps; |
458 | } |
459 | else if (baseType == TYP_DOUBLE) |
460 | { |
461 | result = INS_orpd; |
462 | } |
463 | else if (varTypeIsIntegral(baseType)) |
464 | { |
465 | result = INS_por; |
466 | } |
467 | break; |
468 | |
469 | case SIMDIntrinsicBitwiseXor: |
470 | if (baseType == TYP_FLOAT) |
471 | { |
472 | result = INS_xorps; |
473 | } |
474 | else if (baseType == TYP_DOUBLE) |
475 | { |
476 | result = INS_xorpd; |
477 | } |
478 | else if (varTypeIsIntegral(baseType)) |
479 | { |
480 | result = INS_pxor; |
481 | } |
482 | break; |
483 | |
484 | case SIMDIntrinsicCast: |
485 | result = INS_movaps; |
486 | break; |
487 | |
488 | case SIMDIntrinsicConvertToSingle: |
489 | result = INS_cvtdq2ps; |
490 | break; |
491 | |
492 | case SIMDIntrinsicConvertToDouble: |
493 | assert(baseType == TYP_LONG); |
494 | result = INS_cvtsi2sd; |
495 | break; |
496 | |
497 | case SIMDIntrinsicConvertToInt32: |
498 | assert(baseType == TYP_FLOAT); |
499 | result = INS_cvttps2dq; |
500 | break; |
501 | |
502 | case SIMDIntrinsicConvertToInt64: |
503 | assert(baseType == TYP_DOUBLE); |
504 | result = INS_cvttsd2si; |
505 | break; |
506 | |
507 | case SIMDIntrinsicNarrow: |
508 | // Note that for the integer types the caller must zero the upper bits of |
509 | // each source element, since the instructions saturate. |
510 | switch (baseType) |
511 | { |
512 | case TYP_INT: |
513 | case TYP_UINT: |
514 | if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) |
515 | { |
516 | result = INS_packusdw; |
517 | } |
518 | else |
519 | { |
520 | result = INS_packssdw; |
521 | } |
522 | break; |
523 | case TYP_SHORT: |
524 | case TYP_USHORT: |
525 | result = INS_packuswb; |
526 | break; |
527 | default: |
528 | assert(!"Invalid baseType for SIMDIntrinsicNarrow" ); |
529 | result = INS_invalid; |
530 | break; |
531 | } |
532 | break; |
533 | |
534 | case SIMDIntrinsicWidenLo: |
535 | // Some of these have multiple instruction implementations, with one instruction to widen the lo half, |
536 | // and another to widen the hi half. |
537 | switch (baseType) |
538 | { |
539 | case TYP_FLOAT: |
540 | result = INS_cvtps2pd; |
541 | break; |
542 | case TYP_INT: |
543 | case TYP_UINT: |
544 | result = INS_punpckldq; |
545 | break; |
546 | case TYP_SHORT: |
547 | case TYP_USHORT: |
548 | result = INS_punpcklwd; |
549 | break; |
550 | case TYP_BYTE: |
551 | case TYP_UBYTE: |
552 | result = INS_punpcklbw; |
553 | break; |
554 | default: |
555 | assert(!"Invalid baseType for SIMDIntrinsicWidenLo" ); |
556 | result = INS_invalid; |
557 | break; |
558 | } |
559 | break; |
560 | |
561 | case SIMDIntrinsicWidenHi: |
562 | switch (baseType) |
563 | { |
564 | case TYP_FLOAT: |
565 | // For this case, we actually use the same instruction. |
566 | result = INS_cvtps2pd; |
567 | break; |
568 | case TYP_INT: |
569 | case TYP_UINT: |
570 | result = INS_punpckhdq; |
571 | break; |
572 | case TYP_SHORT: |
573 | case TYP_USHORT: |
574 | result = INS_punpckhwd; |
575 | break; |
576 | case TYP_BYTE: |
577 | case TYP_UBYTE: |
578 | result = INS_punpckhbw; |
579 | break; |
580 | default: |
581 | assert(!"Invalid baseType for SIMDIntrinsicWidenHi" ); |
582 | result = INS_invalid; |
583 | break; |
584 | } |
585 | break; |
586 | |
587 | case SIMDIntrinsicShiftLeftInternal: |
588 | switch (baseType) |
589 | { |
590 | case TYP_SIMD16: |
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
592 | result = INS_pslldq; |
593 | break; |
594 | case TYP_UINT: |
595 | case TYP_INT: |
596 | result = INS_pslld; |
597 | break; |
598 | case TYP_SHORT: |
599 | case TYP_USHORT: |
600 | result = INS_psllw; |
601 | break; |
602 | default: |
603 | assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal" ); |
604 | result = INS_invalid; |
605 | break; |
606 | } |
607 | break; |
608 | |
609 | case SIMDIntrinsicShiftRightInternal: |
610 | switch (baseType) |
611 | { |
612 | case TYP_SIMD16: |
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
614 | result = INS_psrldq; |
615 | break; |
616 | case TYP_UINT: |
617 | case TYP_INT: |
618 | result = INS_psrld; |
619 | break; |
620 | case TYP_SHORT: |
621 | case TYP_USHORT: |
622 | result = INS_psrlw; |
623 | break; |
624 | default: |
625 | assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal" ); |
626 | result = INS_invalid; |
627 | break; |
628 | } |
629 | break; |
630 | |
631 | case SIMDIntrinsicUpperSave: |
632 | result = INS_vextractf128; |
633 | break; |
634 | |
635 | case SIMDIntrinsicUpperRestore: |
            result = INS_vinsertf128;
637 | break; |
638 | |
639 | default: |
640 | assert(!"Unsupported SIMD intrinsic" ); |
641 | unreached(); |
642 | } |
643 | |
644 | noway_assert(result != INS_invalid); |
645 | return result; |
646 | } |
647 | |
//------------------------------------------------------------------------
// genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
// to target mm reg, zeroing out the upper bits if and only if specified.
650 | // |
651 | // Arguments: |
652 | // targetType the target type |
653 | // baseType the base type of value to be moved |
654 | // targetReg the target reg |
655 | // srcReg the src reg |
656 | // moveType action to be performed on target upper bits |
657 | // |
658 | // Return Value: |
659 | // None |
660 | // |
661 | // Notes: |
662 | // This is currently only supported for floating point types. |
663 | // |
664 | void CodeGen::genSIMDScalarMove( |
665 | var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType) |
666 | { |
667 | assert(varTypeIsFloating(baseType)); |
668 | switch (moveType) |
669 | { |
670 | case SMT_PreserveUpper: |
671 | if (srcReg != targetReg) |
672 | { |
673 | instruction ins = ins_Store(baseType); |
674 | if (getEmitter()->IsDstSrcSrcAVXInstruction(ins)) |
675 | { |
                    // In general, when we use a three-operand move instruction, we merge the src
                    // with itself, because we don't want the "merge" behavior. This case is the
                    // exception: we actually want to preserve the upper bits of the target, so we
                    // must specify all three operands, with the target also acting as a source.
679 | inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType)); |
680 | } |
681 | else |
682 | { |
683 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
684 | } |
685 | } |
686 | break; |
687 | |
688 | case SMT_ZeroInitUpper: |
689 | if (compiler->canUseVexEncoding()) |
690 | { |
691 | // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want. |
692 | // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose |
693 | // to zero all but the lower bits. |
694 | unsigned int insertpsImm = |
695 | (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3)); |
696 | inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm); |
697 | } |
698 | else |
699 | { |
700 | if (srcReg == targetReg) |
701 | { |
                    // There is no guarantee that the upper bits of op1Reg are zero.
                    // We zero them by shifting left logically by 12 bytes and then
                    // shifting right logically by 12 bytes.
704 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
705 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); |
706 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
707 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); |
708 | } |
709 | else |
710 | { |
711 | genSIMDZero(targetType, TYP_FLOAT, targetReg); |
712 | inst_RV_RV(ins_Store(baseType), targetReg, srcReg); |
713 | } |
714 | } |
715 | break; |
716 | |
717 | case SMT_ZeroInitUpper_SrcHasUpperZeros: |
718 | if (srcReg != targetReg) |
719 | { |
720 | instruction ins = ins_Copy(baseType); |
721 | assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins)); |
722 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
723 | } |
724 | break; |
725 | |
726 | default: |
727 | unreached(); |
728 | } |
729 | } |
730 | |
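//------------------------------------------------------------------------
// genSIMDZero: Generate code to zero out a SIMD register.
//
// Arguments:
//    targetType - the target SIMD type
//    baseType   - the base (element) type of the vector
//    targetReg  - the register to zero
//
// Return Value:
//    None.
//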
731 | void CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg) |
732 | { |
    // We just use `INS_xorps` instead of `getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType)`
    // since `genSIMDZero` is used for both `System.Numerics.Vectors` and HardwareIntrinsics. Modern
    // CPUs handle this idiom specially in the renamer, so it never hits the execution pipeline, and
    // `INS_xorps` is always available (under both the legacy and VEX encodings).
737 | inst_RV_RV(INS_xorps, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
738 | } |
739 | |
740 | //------------------------------------------------------------------------ |
741 | // genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize. |
742 | // |
743 | // Arguments: |
744 | // simdNode - The GT_SIMD node |
745 | // |
746 | // Return Value: |
747 | // None. |
748 | // |
749 | void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode) |
750 | { |
751 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit); |
752 | |
753 | GenTree* op1 = simdNode->gtGetOp1(); |
754 | var_types baseType = simdNode->gtSIMDBaseType; |
755 | regNumber targetReg = simdNode->gtRegNum; |
756 | assert(targetReg != REG_NA); |
757 | var_types targetType = simdNode->TypeGet(); |
758 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
759 | unsigned size = simdNode->gtSIMDSize; |
760 | |
761 | // Should never see small int base type vectors except for zero initialization. |
762 | noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0)); |
763 | |
764 | instruction ins = INS_invalid; |
765 | |
766 | #if !defined(_TARGET_64BIT_) |
767 | if (op1->OperGet() == GT_LONG) |
768 | { |
769 | assert(varTypeIsLong(baseType)); |
770 | |
771 | GenTree* op1lo = op1->gtGetOp1(); |
772 | GenTree* op1hi = op1->gtGetOp2(); |
773 | |
774 | if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) |
775 | { |
776 | genSIMDZero(targetType, baseType, targetReg); |
777 | } |
778 | else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)) |
779 | { |
780 | // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg. |
781 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT); |
782 | inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
783 | } |
784 | else |
785 | { |
786 | // Generate: |
787 | // mov_i2xmm targetReg, op1lo |
788 | // mov_i2xmm xmmtmp, op1hi |
789 | // shl xmmtmp, 4 bytes |
790 | // por targetReg, xmmtmp |
791 | // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using: |
792 | // shufpd targetReg, targetReg, 0 // move the long to all the lanes |
793 | // For AVX2, move it to all 4 of the 64-bit lanes using: |
794 | // vpbroadcastq targetReg, targetReg |
795 | |
798 | regNumber op1loReg = genConsumeReg(op1lo); |
799 | ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT); |
800 | inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT)); |
801 | |
802 | regNumber tmpReg = simdNode->GetSingleTempReg(); |
803 | |
804 | regNumber op1hiReg = genConsumeReg(op1hi); |
805 | ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT); |
806 | inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT)); |
807 | |
808 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
809 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes |
810 | |
811 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType); |
812 | inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
813 | |
814 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
815 | { |
816 | inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32)); |
817 | } |
818 | else |
819 | { |
820 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType); |
821 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0); |
822 | } |
823 | } |
824 | } |
825 | else |
826 | #endif // !defined(_TARGET_64BIT_) |
827 | if (op1->isContained()) |
828 | { |
829 | if (op1->IsIntegralConst(0) || op1->IsFPZero()) |
830 | { |
831 | genSIMDZero(targetType, baseType, targetReg); |
832 | } |
833 | else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1)) |
834 | { |
835 | // case of initializing elements of vector with all 1's |
836 | // generate pcmpeqd reg, reg |
837 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT); |
838 | inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType)); |
839 | } |
840 | else |
841 | { |
842 | assert(level == SIMD_AVX2_Supported); |
843 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType); |
844 | if (op1->IsCnsFltOrDbl()) |
845 | { |
846 | getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1); |
847 | } |
848 | else if (op1->OperIsLocalAddr()) |
849 | { |
850 | unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0; |
851 | getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, |
852 | offset); |
853 | } |
854 | else |
855 | { |
856 | unreached(); |
857 | } |
858 | } |
859 | } |
860 | else if (level == SIMD_AVX2_Supported && ((size == 32) || (size == 16))) |
861 | { |
862 | regNumber srcReg = genConsumeReg(op1); |
863 | if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG) |
864 | { |
865 | ins = ins_CopyIntToFloat(baseType, TYP_FLOAT); |
866 | assert(ins != INS_invalid); |
867 | inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); |
868 | srcReg = targetReg; |
869 | } |
870 | |
871 | ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
872 | getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg); |
873 | } |
874 | else |
875 | { |
        // If we reach here, op1 is not contained, and either we are targeting SSE or this is a
        // sub-register SIMD type. In either case we are going to use the SSE2 shuffle instruction.
878 | |
879 | regNumber op1Reg = genConsumeReg(op1); |
880 | unsigned shuffleControl = 0; |
881 | |
882 | if (compiler->isSubRegisterSIMDType(simdNode)) |
883 | { |
884 | assert(baseType == TYP_FLOAT); |
885 | |
            // We cannot assume that the upper bits of op1Reg or targetReg are zero.
            // Therefore we need to explicitly zero them out. This is essential for the
            // shuffle operation performed below.
            //
            // If op1 is a float/double constant, we would have loaded it from the data
            // section using movss/sd. Similarly, if op1 is a memory op, we would have
            // loaded it using movss/sd, which zeroes out the upper bits of an xmm reg
            // when loading from memory. In these cases we can avoid explicitly zeroing
            // out targetReg if targetReg and op1Reg are the same, or do it more
            // efficiently if they are not.
896 | SIMDScalarMoveType moveType = |
897 | op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper; |
898 | |
899 | genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType); |
900 | |
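            // Choose a shuffle control that replicates element 0 into the live lanes and fills
            // the remaining lanes from the (already zeroed) upper elements:
            // 0x50 yields lanes (v, v, 0, 0) for Vector2; 0x40 yields lanes (v, v, v, 0) for Vector3.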
901 | if (size == 8) |
902 | { |
903 | shuffleControl = 0x50; |
904 | } |
905 | else if (size == 12) |
906 | { |
907 | shuffleControl = 0x40; |
908 | } |
909 | else |
910 | { |
911 | noway_assert(!"Unexpected size for SIMD type" ); |
912 | } |
913 | } |
914 | else // Vector<T> |
915 | { |
916 | if (op1Reg != targetReg) |
917 | { |
918 | if (varTypeIsFloating(baseType)) |
919 | { |
920 | ins = ins_Copy(targetType); |
921 | } |
922 | else if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG) |
923 | { |
924 | ins = ins_CopyIntToFloat(baseType, TYP_FLOAT); |
925 | } |
926 | |
927 | assert(ins != INS_invalid); |
928 | inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType)); |
929 | } |
930 | } |
931 | |
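        // Broadcast the value with the SSE2 shuffle; for full-register vectors shuffleControl is 0,
        // which replicates element 0 into every lane.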
932 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType); |
933 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl); |
934 | } |
935 | |
936 | genProduceReg(simdNode); |
937 | } |
938 | |
939 | //------------------------------------------------------------------------------------------- |
940 | // genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes |
941 | // a number of arguments equal to the length of the Vector. |
942 | // |
943 | // Arguments: |
944 | // simdNode - The GT_SIMD node |
945 | // |
946 | // Return Value: |
947 | // None. |
948 | // |
949 | void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode) |
950 | { |
951 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN); |
952 | |
953 | // Right now this intrinsic is supported only on TYP_FLOAT vectors |
954 | var_types baseType = simdNode->gtSIMDBaseType; |
955 | noway_assert(baseType == TYP_FLOAT); |
956 | |
957 | regNumber targetReg = simdNode->gtRegNum; |
958 | assert(targetReg != REG_NA); |
959 | |
960 | var_types targetType = simdNode->TypeGet(); |
961 | |
    // Note that we cannot use targetReg until we have consumed all the source operands.
    // Therefore we need an internal register to stitch together all the values into a
    // single vector in an XMM reg.
965 | regNumber vectorReg = simdNode->GetSingleTempReg(); |
966 | |
    // Zero out vectorReg if we are constructing a vector whose size is not equal to the targetType
    // vector size. For example, in the case of Vector4f we don't need to zero when using SSE2.
969 | if (compiler->isSubRegisterSIMDType(simdNode)) |
970 | { |
971 | genSIMDZero(targetType, baseType, vectorReg); |
972 | } |
973 | |
974 | unsigned int baseTypeSize = genTypeSize(baseType); |
975 | instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
976 | |
977 | // We will first consume the list items in execution (left to right) order, |
978 | // and record the registers. |
979 | regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT]; |
980 | unsigned initCount = 0; |
981 | for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2()) |
982 | { |
983 | assert(list->OperGet() == GT_LIST); |
984 | GenTree* listItem = list->gtGetOp1(); |
985 | assert(listItem->TypeGet() == baseType); |
986 | assert(!listItem->isContained()); |
987 | regNumber operandReg = genConsumeReg(listItem); |
988 | operandRegs[initCount] = operandReg; |
989 | initCount++; |
990 | } |
991 | |
992 | unsigned int offset = 0; |
993 | for (unsigned i = 0; i < initCount; i++) |
994 | { |
        // We will now construct the vector from the list items in reverse order.
        // This allows us to efficiently stitch together a vector as follows:
        //    vectorReg = (vectorReg << offset)
        //    vectorReg[0] = listItemReg
        // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
        // bits of vectorReg are not modified.
1001 | |
1002 | regNumber operandReg = operandRegs[initCount - i - 1]; |
1003 | if (offset != 0) |
1004 | { |
1005 | getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize); |
1006 | } |
1007 | genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper); |
1008 | |
1009 | offset += baseTypeSize; |
1010 | } |
1011 | |
1012 | noway_assert(offset == simdNode->gtSIMDSize); |
1013 | |
    // Copy the fully initialized vector to the target register.
1015 | if (targetReg != vectorReg) |
1016 | { |
1017 | inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType)); |
1018 | } |
1019 | genProduceReg(simdNode); |
1020 | } |
1021 | |
1022 | //---------------------------------------------------------------------------------- |
1023 | // genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt. |
1024 | // |
1025 | // Arguments: |
1026 | // simdNode - The GT_SIMD node |
1027 | // |
1028 | // Return Value: |
1029 | // None. |
1030 | // |
1031 | void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode) |
1032 | { |
1033 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast || |
1034 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAbs); |
1035 | |
1036 | GenTree* op1 = simdNode->gtGetOp1(); |
1037 | var_types baseType = simdNode->gtSIMDBaseType; |
1038 | regNumber targetReg = simdNode->gtRegNum; |
1039 | assert(targetReg != REG_NA); |
1040 | var_types targetType = simdNode->TypeGet(); |
1041 | |
1042 | regNumber op1Reg = genConsumeReg(op1); |
1043 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1044 | if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg) |
1045 | { |
1046 | inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1047 | } |
1048 | genProduceReg(simdNode); |
1049 | } |
1050 | |
1051 | //---------------------------------------------------------------------------------- |
1052 | // genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float) |
1053 | // |
1054 | // Arguments: |
1055 | // simdNode - The GT_SIMD node |
1056 | // |
1057 | // Return Value: |
1058 | // None. |
1059 | // |
1060 | void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode) |
1061 | { |
1062 | SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; |
1063 | assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32)); |
1064 | |
1065 | GenTree* op1 = simdNode->gtGetOp1(); |
1066 | var_types baseType = simdNode->gtSIMDBaseType; |
1067 | regNumber targetReg = simdNode->gtRegNum; |
1068 | assert(targetReg != REG_NA); |
1069 | var_types targetType = simdNode->TypeGet(); |
1070 | |
1071 | regNumber op1Reg = genConsumeReg(op1); |
1072 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1073 | if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT) |
1074 | { |
1075 | regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
1076 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1077 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1078 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg); |
1079 | |
1080 | // We will generate the following: |
1081 | // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) |
1082 | // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) |
1083 | // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg) |
1084 | // vpslld tmpReg2, 16 |
1085 | // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2) |
1086 | // mov tmpIntReg, 0x5300000053000000 |
1087 | // vmovd tmpReg, tmpIntReg |
1088 | // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src) |
1089 | // vorps targetReg, tmpReg |
1090 | // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg) |
1091 | // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2) |
1092 | // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits) |
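        //
        // 0x53000000 is 2^39 in IEEE-754 single format: OR-ing a 16-bit value v into its mantissa
        // yields 2^39 + v * 2^16, so subtracting 2^39 back out leaves exactly float(v << 16), the
        // contribution of the upper 16 bits of each source element.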
1093 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType)); |
1094 | if (targetReg != op1Reg) |
1095 | { |
1096 | inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType)); |
1097 | } |
1098 | |
1099 | // prepare upper 16 bits |
1100 | getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16); |
1101 | |
1102 | // prepare lower 16 bits |
1103 | getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16); |
1104 | getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16); |
1105 | |
1106 | // prepare mask |
1107 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x5300000053000000);
1109 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1110 | #else |
1111 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1112 | { |
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x53000000);
1114 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1115 | } |
1116 | else |
1117 | { |
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x00005300);
1119 | inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1120 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1); |
1121 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3); |
1122 | } |
1123 | #endif |
1124 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1125 | { |
1126 | inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1127 | } |
1128 | else |
1129 | { |
1130 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1131 | } |
1132 | |
1133 | // convert upper 16 bits |
1134 | inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1135 | inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1136 | |
1137 | // convert lower 16 bits |
1138 | inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType)); |
1139 | |
1140 | // add lower 16 bits and upper 16 bits |
1141 | inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType)); |
1142 | } |
1143 | else |
1144 | { |
1145 | inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1146 | } |
1147 | genProduceReg(simdNode); |
1148 | } |
1149 | |
1150 | //---------------------------------------------------------------------------------- |
1151 | // genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double) |
1152 | // |
1153 | // Arguments: |
1154 | // intrinsicID the SIMD intrinsic ID |
1155 | // simdType the SIMD node type |
1156 | // baseType the base type of value to be converted |
1157 | // tmpReg the tmp reg |
1158 | // tmpIntReg the tmp integer reg |
1159 | // targetReg the target reg |
1160 | // |
1161 | // Return Value: |
1162 | // None. |
1163 | // |
1164 | void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID, |
1165 | var_types simdType, |
1166 | var_types baseType, |
1167 | regNumber tmpReg, |
1168 | regNumber tmpIntReg, |
1169 | regNumber targetReg) |
1170 | { |
1171 | instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType); |
1172 | if (intrinsicID == SIMDIntrinsicConvertToDouble) |
1173 | { |
1174 | // Note that for mov_xmm2i, the int register is always in the reg2 position |
1175 | inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG); |
1176 | inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType)); |
1177 | } |
1178 | else |
1179 | { |
1180 | inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType)); |
1181 | inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG); |
1182 | } |
1183 | } |
1184 | |
1185 | //---------------------------------------------------------------------------------- |
1186 | // genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double) |
1187 | // |
1188 | // Arguments: |
1189 | // simdNode - The GT_SIMD node |
1190 | // |
1191 | // Notes: |
//    There are no SSE/AVX instructions for converting a vector to/from 64-bit integers,
//    so for these we do the conversion one element at a time.
1194 | // |
1195 | void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode) |
1196 | { |
1197 | SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; |
1198 | assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64)); |
1199 | |
1200 | GenTree* op1 = simdNode->gtGetOp1(); |
1201 | var_types baseType = simdNode->gtSIMDBaseType; |
1202 | regNumber targetReg = simdNode->gtRegNum; |
1203 | assert(targetReg != REG_NA); |
1204 | var_types simdType = simdNode->TypeGet(); |
1205 | regNumber op1Reg = genConsumeReg(op1); |
1206 | regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
1207 | regNumber tmpReg; |
1208 | regNumber tmpReg2; |
1209 | regNumber tmpReg3; |
1210 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1211 | |
1212 | #ifdef _TARGET_X86_ |
1213 | if (baseType == TYP_LONG) |
1214 | { |
1215 | tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1216 | tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1217 | tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1218 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg); |
1219 | } |
1220 | else |
1221 | #endif |
1222 | if (level == SIMD_AVX2_Supported || (baseType == TYP_ULONG)) |
1223 | { |
1224 | tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1225 | tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1226 | tmpReg3 = REG_NA; |
1227 | assert(tmpReg != op1Reg && tmpReg2 != op1Reg); |
1228 | } |
1229 | else |
1230 | { |
1231 | tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1232 | assert(tmpReg != op1Reg); |
1233 | tmpReg2 = REG_NA; |
1234 | tmpReg3 = REG_NA; |
1235 | } |
1236 | |
1237 | if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG)) |
1238 | { |
1239 | // We will generate the following |
1240 | // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) |
1241 | // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) |
1242 | // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg) |
1243 | // vpsllq tmpReg2, 32 |
1244 | // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2) |
1245 | // mov tmpIntReg, 0x4530000000000000 |
1246 | // vmovd tmpReg, tmpIntReg |
1247 | // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src) |
1248 | // vorpd targetReg, tmpReg |
1249 | // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg) |
1250 | // mov tmpIntReg, 0x4330000000000000 |
1251 | // vmovd tmpReg, tmpIntReg |
1252 | // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src) |
1253 | // vorpd tmpReg2, tmpReg |
1254 | // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2) |
1255 | // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together) |
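        //
        // 0x4530000000000000 is 2^84 and 0x4330000000000000 is 2^52 in IEEE-754 double format:
        // OR-ing a 32-bit value into the mantissa and then subtracting the constant back out
        // yields double(upper32) * 2^32 and double(lower32), respectively.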
1256 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); |
1257 | if (targetReg != op1Reg) |
1258 | { |
1259 | inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType)); |
1260 | } |
1261 | |
1262 | // prepare upper 32 bits |
1263 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); |
1264 | |
1265 | // prepare lower 32 bits |
1266 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); |
1267 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); |
1268 | |
1269 | // prepare mask for converting upper 32 bits |
1270 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x4530000000000000);
1272 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1273 | #else |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x45300000);
1275 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1276 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1277 | #endif |
1278 | if (level == SIMD_AVX2_Supported) |
1279 | { |
1280 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1281 | } |
1282 | else |
1283 | { |
1284 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1285 | } |
1286 | |
1287 | // convert upper 32 bits |
1288 | inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1289 | inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1290 | |
1291 | // prepare mask for converting lower 32 bits |
1292 | #ifdef _TARGET_AMD64_ |
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0x4330000000000000);
1294 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); |
1295 | #else |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x43300000);
1297 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1298 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1299 | #endif |
1300 | if (level == SIMD_AVX2_Supported) |
1301 | { |
1302 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1303 | } |
1304 | else |
1305 | { |
1306 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1307 | } |
1308 | |
1309 | // convert lower 32 bits |
1310 | inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1311 | inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1312 | |
1313 | // add lower 32 bits and upper 32 bits |
1314 | inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); |
1315 | } |
1316 | else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG)) |
1317 | { |
1318 | #ifdef _TARGET_AMD64_ |
1319 | instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1320 | instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1321 | |
1322 | if (level == SIMD_AVX2_Supported) |
1323 | { |
            // Extract the high 16 bytes
1325 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1326 | |
1327 | // Put v[3] (the high-order element) in tmpReg2 and convert it. |
1328 | inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1329 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1330 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); |
1331 | |
1332 | // Shift the resulting 64-bits left. |
1333 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1334 | |
1335 | // Convert v[2], in the lo bits of tmpReg. |
1336 | // For the convert to double, the convert preserves the upper bits in tmpReg2. |
1337 | // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. |
1338 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2); |
1339 | } |
1340 | |
1341 | // Put v[1] in tmpReg. |
1342 | inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); |
1343 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1344 | |
1345 | // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. |
1346 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1347 | |
1348 | // Shift the resulting 64-bits left. |
1349 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1350 | |
        // Convert the lo 64-bits into tmpReg (merging with the shifted result already there).
1352 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg); |
1353 | |
1354 | // Merge or copy the results (only at this point are we done with op1Reg). |
1355 | if (tmpReg != targetReg) |
1356 | { |
1357 | inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1358 | } |
1359 | |
1360 | if (level == SIMD_AVX2_Supported) |
1361 | { |
1362 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01); |
1363 | } |
1364 | #else |
1365 | // get the sign bit and put it in tmpReg3 |
1366 | inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType)); |
1367 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63); |
1368 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63); |
1369 | |
1370 | // get the absolute value of src and put it into tmpReg2 and targetReg |
1371 | inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); |
1372 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY); |
1373 | getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32); |
1374 | inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); |
1375 | inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); |
1376 | inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType)); |
1377 | |
1378 | // prepare upper 32 bits |
1379 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); |
1380 | |
1381 | // prepare lower 32 bits |
1382 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); |
1383 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); |
1384 | |
1385 | // prepare mask for converting upper 32 bits |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x45300000);
1387 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1388 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1389 | |
1390 | if (level == SIMD_AVX2_Supported) |
1391 | { |
1392 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1393 | } |
1394 | else |
1395 | { |
1396 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1397 | } |
1398 | |
1399 | // convert upper 32 bits |
1400 | inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1401 | inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1402 | |
1403 | // prepare mask for converting lower 32 bits |
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0x43300000);
1405 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
1406 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
1407 | |
1408 | if (level == SIMD_AVX2_Supported) |
1409 | { |
1410 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1411 | } |
1412 | else |
1413 | { |
1414 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1415 | } |
1416 | |
1417 | // convert lower 32 bits |
1418 | inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1419 | inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1420 | |
1421 | // add lower 32 bits and upper 32 bits |
1422 | inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); |
1423 | |
1424 | // add sign bit |
1425 | inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType)); |
1426 | #endif |
1427 | } |
1428 | else |
1429 | { |
1430 | instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1431 | instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1432 | |
1433 | if (level == SIMD_AVX2_Supported) |
1434 | { |
            // Extract the high 16 bytes
1436 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1437 | |
1438 | // Put v[3] (the high-order element) in tmpReg2 and convert it. |
1439 | inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1440 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1441 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); |
1442 | |
1443 | // Shift the resulting 64-bits left. |
1444 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
1445 | |
1446 | // Convert v[2], in the lo bits of tmpReg. |
1447 | // For the convert to double, the convert preserves the upper bits in tmpReg2. |
1448 | // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. |
1449 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1450 | inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
1451 | } |
1452 | |
1453 | // Put v[1] in tmpReg. |
1454 | inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); |
1455 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1456 | |
1457 | // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. |
1458 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
1459 | |
1460 | // Shift the resulting 64-bits left. |
1461 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
1462 | |
1463 | // Convert the lo 64-bits into targetReg |
1464 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg); |
1465 | |
1466 | // Merge or copy the results (only at this point are we done with op1Reg). |
1467 | assert(tmpReg != targetReg); |
1468 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
1469 | if (level == SIMD_AVX2_Supported) |
1470 | { |
1471 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01); |
1472 | } |
1473 | } |
1474 | genProduceReg(simdNode); |
1475 | } |
1476 | |
1477 | //-------------------------------------------------------------------------------- |
1478 | // genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register |
1479 | // |
1480 | // Arguments: |
1481 | // simdNode - The GT_SIMD node |
1482 | // |
1483 | // Notes: |
1484 | // This is used for the WidenHi intrinsic to extract the upper half. |
1485 | // On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes. |
1486 | // |
void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
1488 | { |
1489 | var_types simdType = simdNode->TypeGet(); |
1490 | emitAttr emitSize = emitActualTypeSize(simdType); |
1491 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
1492 | { |
        instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
1494 | getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01); |
1495 | } |
1496 | else |
1497 | { |
1498 | instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1499 | if (tgtReg != srcReg) |
1500 | { |
1501 | inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize); |
1502 | } |
1503 | getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8); |
1504 | } |
1505 | } |
1506 | |
1507 | //-------------------------------------------------------------------------------- |
1508 | // genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations |
1509 | // |
1510 | // Arguments: |
1511 | // simdNode - The GT_SIMD node |
1512 | // |
1513 | // Notes: |
1514 | // The Widen intrinsics are broken into separate intrinsics for the two results. |
1515 | // |
1516 | void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) |
1517 | { |
1518 | assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || |
1519 | (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); |
1520 | |
1521 | GenTree* op1 = simdNode->gtGetOp1(); |
1522 | var_types baseType = simdNode->gtSIMDBaseType; |
1523 | regNumber targetReg = simdNode->gtRegNum; |
1524 | assert(targetReg != REG_NA); |
1525 | var_types simdType = simdNode->TypeGet(); |
1526 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1527 | |
1528 | genConsumeOperands(simdNode); |
1529 | regNumber op1Reg = op1->gtRegNum; |
1530 | regNumber srcReg = op1Reg; |
1531 | emitAttr emitSize = emitActualTypeSize(simdType); |
1532 | instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1533 | |
1534 | if (baseType == TYP_FLOAT) |
1535 | { |
1536 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
1537 | { |
1538 | genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); |
1539 | srcReg = targetReg; |
1540 | } |
1541 | inst_RV_RV(widenIns, targetReg, srcReg, simdType); |
1542 | } |
1543 | else |
1544 | { |
        // We will generate the following on AVX2:
1546 | // vpermq targetReg, op1Reg, 0xd4|0xe8 |
1547 | // vpxor tmpReg, tmpReg |
1548 | // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) |
1549 | // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg |
1550 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1551 | assert(tmpReg != op1Reg); |
1552 | |
1553 | if (level == SIMD_AVX2_Supported) |
1554 | { |
1555 | // permute op1Reg and put it into targetReg |
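            // The permute positions the source qwords for the per-lane unpack below: with 0xd4,
            // punpckl* sees qwords 0 and 1 in the low halves of lanes 0 and 1 (WidenLo); with
            // 0xe8, punpckh* sees qwords 2 and 3 in the high halves of lanes 0 and 1 (WidenHi).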
1556 | unsigned ival = 0xd4; |
1557 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
1558 | { |
1559 | ival = 0xe8; |
1560 | } |
1561 | getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival); |
1562 | } |
1563 | else if (targetReg != op1Reg) |
1564 | { |
1565 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
1566 | } |
1567 | |
1568 | genSIMDZero(simdType, baseType, tmpReg); |
1569 | if (!varTypeIsUnsigned(baseType)) |
1570 | { |
1571 | instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType); |
1572 | inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); |
1573 | } |
1574 | inst_RV_RV(widenIns, targetReg, tmpReg, simdType); |
1575 | } |
1576 | genProduceReg(simdNode); |
1577 | } |
1578 | |
1579 | //-------------------------------------------------------------------------------- |
1580 | // genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations |
1581 | // |
1582 | // Arguments: |
1583 | // simdNode - The GT_SIMD node |
1584 | // |
1585 | // Notes: |
1586 | // This intrinsic takes two arguments. The first operand is narrowed to produce the |
1587 | // lower elements of the results, and the second operand produces the high elements. |
1588 | // |
1589 | void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) |
1590 | { |
1591 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); |
1592 | |
1593 | GenTree* op1 = simdNode->gtGetOp1(); |
1594 | GenTree* op2 = simdNode->gtGetOp2(); |
1595 | var_types baseType = simdNode->gtSIMDBaseType; |
1596 | regNumber targetReg = simdNode->gtRegNum; |
1597 | assert(targetReg != REG_NA); |
1598 | var_types simdType = simdNode->TypeGet(); |
1599 | emitAttr emitSize = emitTypeSize(simdType); |
1600 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1601 | |
1602 | genConsumeOperands(simdNode); |
1603 | regNumber op1Reg = op1->gtRegNum; |
1604 | regNumber op2Reg = op2->gtRegNum; |
1605 | if (baseType == TYP_DOUBLE) |
1606 | { |
1607 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1608 | |
1609 | inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); |
1610 | inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); |
1611 | // Now insert the high-order result (in tmpReg) into the upper half of targetReg. |
1612 | if (level == SIMD_AVX2_Supported) |
1613 | { |
1614 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); |
1615 | } |
1616 | else |
1617 | { |
1618 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX); |
1619 | } |
1620 | } |
1621 | else if (varTypeIsLong(baseType)) |
1622 | { |
1623 | if (level == SIMD_AVX2_Supported) |
1624 | { |
1625 | // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. |
1626 | // We will generate the following: |
1627 | // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) |
1628 | // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) |
1629 | // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) |
1630 | // mov tmpReg2, op1Reg |
1631 | // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) |
1632 | // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg |
1633 | // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg |
1634 | // punpcklqdq tgtReg, tmpReg |
1635 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1636 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1637 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
1638 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
1639 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); |
1640 | inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize); |
1641 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
1642 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX); |
1643 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX); |
1644 | inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); |
1645 | } |
1646 | else |
1647 | { |
1648 | // We will generate the following: |
1649 | // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) |
1650 | // psrldq targetReg, 8 (shift them right to get zeros in the high elements) |
1651 | // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) |
1652 | // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) |
1653 | // por targetReg, tmpReg |
1654 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1655 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1656 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1657 | emitAttr emitSize = emitTypeSize(simdType); |
1658 | |
1659 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX); |
1660 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); |
1661 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX); |
1662 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); |
1663 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType); |
1664 | } |
1665 | } |
1666 | else |
1667 | { |
1668 | // We will generate the following: |
1669 | // mov targetReg, op1Reg |
1670 | // mov tmpReg, op2Reg |
1671 | // psll? targetReg, shiftCount |
1672 | // pslr? targetReg, shiftCount |
1673 | // psll? tmpReg, shiftCount |
1674 | // pslr? tmpReg, shiftCount |
1675 | // <pack> targetReg, tmpReg |
1676 | // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), |
1677 | // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to |
1678 | // get CLR type semantics; otherwise it will saturate). |
1679 | // |
1680 | int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); |
1681 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1682 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); |
1683 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); |
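        // For example, when narrowing Vector<int> to Vector<short>, shiftCount is
        // 4 * (8 / 2) = 16: each 32-bit element is shifted left and then right by 16 bits,
        // truncating it to its low 16 bits before the pack.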
1684 | |
1685 | if (level == SIMD_AVX2_Supported) |
1686 | { |
1687 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
1688 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1689 | |
1690 | // The AVX instructions generally operate on "lanes", so we have to permute the |
1691 | // inputs so that the destination register has the low 128-bit halves of the two |
1692 | // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs. |
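            // (Each nibble of the vperm2i128 immediate selects a 128-bit lane: 0x20 gives
            // [op2.lane0 : op1.lane0] and 0x31 gives [op2.lane1 : op1.lane1], written
            // high lane : low lane.)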
1693 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); |
1694 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); |
1695 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); |
1696 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); |
1697 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
1698 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); |
1699 | inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); |
1700 | } |
1701 | else |
1702 | { |
1703 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
1704 | |
1705 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
1706 | inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize); |
1707 | |
1708 | instruction tmpShiftRight = shiftRightIns; |
1709 | if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported) |
1710 | { |
1711 | tmpShiftRight = INS_psrad; |
1712 | } |
1713 | |
1714 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); |
1715 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); |
1716 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
1717 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); |
1718 | inst_RV_RV(ins, targetReg, tmpReg, simdType); |
1719 | } |
1720 | } |
1721 | genProduceReg(simdNode); |
1722 | } |
1723 | |
1724 | //-------------------------------------------------------------------------------- |
1725 | // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations |
1726 | // add, sub, mul, bit-wise And, AndNot and Or. |
1727 | // |
1728 | // Arguments: |
1729 | // simdNode - The GT_SIMD node |
1730 | // |
1731 | // Return Value: |
1732 | // None. |
1733 | // |
1734 | void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) |
1735 | { |
1736 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || |
1737 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv || |
1738 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || |
1739 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot || |
1740 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr || |
1741 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin || |
1742 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax); |
1743 | |
1744 | GenTree* op1 = simdNode->gtGetOp1(); |
1745 | GenTree* op2 = simdNode->gtGetOp2(); |
1746 | var_types baseType = simdNode->gtSIMDBaseType; |
1747 | regNumber targetReg = simdNode->gtRegNum; |
1748 | assert(targetReg != REG_NA); |
1749 | var_types targetType = simdNode->TypeGet(); |
1750 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1751 | |
1752 | genConsumeOperands(simdNode); |
1753 | regNumber op1Reg = op1->gtRegNum; |
1754 | regNumber op2Reg = op2->gtRegNum; |
1755 | regNumber otherReg = op2Reg; |
1756 | |
1757 | // Vector<Int>.Mul: |
1758 | // SSE2 doesn't have an instruction to perform this operation directly |
1759 | // whereas SSE4.1 does (pmulld). This is special cased and computed |
1760 | // as follows. |
1761 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported) |
1762 | { |
1763 | // We need a temporary register that is NOT the same as the target, |
1764 | // and we MAY need another. |
1765 | regNumber tmpReg = simdNode->ExtractTempReg(); |
1766 | regNumber tmpReg2 = simdNode->GetSingleTempReg(); |
1767 | |
1768 | // The register allocator guarantees the following conditions: |
1769 | // - the only registers that may be the same among op1Reg, op2Reg, tmpReg |
1770 | // and tmpReg2 are op1Reg and op2Reg. |
1771 | // Let's be extra-careful and assert that now. |
1772 | assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) && |
1773 | (tmpReg != tmpReg2)); |
1774 | |
1775 | // We will start by setting things up so that: |
1776 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
1777 | // - We have op2 in op2Reg and tmpReg |
1778 | // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified, |
1779 | // OR they are the targetReg that will be produced. |
1780 | // (Note that in the code we generate below op1Reg and op2Reg are never written.) |
1781 | // We will copy things as necessary to ensure that this is the case. |
1782 | // Note that we can swap op1 and op2, since multiplication is commutative. |
1783 | // We will not modify the values in op1Reg and op2Reg. |
1784 | // (Though note that if either op1 or op2 is the same as targetReg, we will make |
1785 | // a copy and use that copy as the input register. In that case we WILL modify |
1786 | // the original value in the register, but will wind up with the result in targetReg |
1787 | // in the end, as expected.) |
1788 | |
        // First, we need a tmpReg that is NOT the same as targetReg.
        // If tmpReg == targetReg, we can use tmpReg2 instead: the register allocator
        // guarantees tmpReg != op1Reg, so in that case op1Reg != targetReg and the
        // "op1Reg == targetReg" copy below (which consumes tmpReg2) cannot be needed.
1792 | if (tmpReg == targetReg) |
1793 | { |
1794 | tmpReg = tmpReg2; |
1795 | } |
1796 | |
1797 | if (op2Reg == targetReg) |
1798 | { |
1799 | // We will swap the operands. |
1800 | // Since the code below only deals with registers, this now becomes the case where |
1801 | // op1Reg == targetReg. |
1802 | op2Reg = op1Reg; |
1803 | op1Reg = targetReg; |
1804 | } |
1805 | if (op1Reg == targetReg) |
1806 | { |
1807 | // Copy op1, and make tmpReg2 the new op1Reg. |
1808 | // Note that those regs can't be the same, as we asserted above. |
1809 | // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit |
1810 | // the "tmpReg == targetReg" case. |
1811 | inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType)); |
1812 | op1Reg = tmpReg2; |
1813 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1814 | // However, we have one more case to worry about: what if op2Reg is also targetReg |
1815 | // (i.e. we have the same operand as op1 and op2)? |
1816 | // In that case we will set op2Reg to the same register as op1Reg. |
1817 | if (op2Reg == targetReg) |
1818 | { |
1819 | op2Reg = tmpReg2; |
1820 | } |
1821 | } |
1822 | else |
1823 | { |
1824 | // Copy op1 to targetReg and op2 to tmpReg. |
1825 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1826 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1827 | } |
1828 | // Let's assert that things are as we expect. |
1829 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
1830 | assert(op1Reg != targetReg); |
1831 | // - We have op2 in op2Reg and tmpReg, and they are different registers. |
1832 | assert(op2Reg != tmpReg); |
1833 | // - Either we are going to leave op1's reg unmodified, or it is the targetReg. |
1834 | assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg)); |
1835 | // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg. |
1836 | assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg)); |
1837 | |
1838 | // Now we can generate the code. |
1839 | |
1840 | // targetReg = op1 >> 4-bytes (op1 is already in targetReg) |
1841 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4); |
1842 | |
1843 | // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg) |
1844 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4); |
1845 | |
1846 | // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially |
1847 | // tmpReg[63:0] = op1[1] * op2[1] |
1848 | // tmpReg[127:64] = op1[3] * op2[3] |
1849 | inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType)); |
1850 | |
1851 | // Extract first and third double word results from tmpReg |
1852 | // tmpReg = shuffle(0,0,2,0) of tmpReg |
1853 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX); |
1854 | |
1855 | // targetReg[63:0] = op1[0] * op2[0] |
1856 | // targetReg[127:64] = op1[2] * op2[2] |
1857 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1858 | inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
1859 | |
1860 | // Extract first and third double word results from targetReg |
1861 | // targetReg = shuffle(0,0,2,0) of targetReg |
1862 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX); |
1863 | |
1864 | // pack the results into a single vector |
1865 | inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
1866 | } |
1867 | else |
1868 | { |
1869 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
1870 | |
        // Note: INS_cvtsi2ss and INS_cvtsi2sd take an integer source, so we don't use
        // the AVX three-operand form for them.
1873 | if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && |
1874 | !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins)) |
1875 | { |
1876 | inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); |
1877 | } |
1878 | else |
1879 | { |
1880 | if (op2Reg == targetReg) |
1881 | { |
1882 | otherReg = op1Reg; |
1883 | } |
1884 | else if (op1Reg != targetReg) |
1885 | { |
1886 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1887 | } |
1888 | |
1889 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
1890 | } |
1891 | } |
1892 | |
1893 | // Vector2/3 div: since the top-most elements will be zero, we end up |
    // performing 0/0, which is a NaN. Therefore, post division we need to set the
1895 | // top-most elements to zero. This is achieved by left logical shift followed |
1896 | // by right logical shift of targetReg. |
1897 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16)) |
1898 | { |
1899 | // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length. |
1900 | unsigned shiftCount = 16 - simdNode->gtSIMDSize; |
1901 | assert(shiftCount != 0); |
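        // For example, a Vector3 divide has gtSIMDSize == 12, so shiftCount == 4;
        // shifting left then right by 4 bytes zeroes the uppermost float element.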
1902 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
1903 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
1904 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
1905 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
1906 | } |
1907 | |
1908 | genProduceReg(simdNode); |
1909 | } |
1910 | |
1911 | //-------------------------------------------------------------------------------- |
// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
1913 | // <, <=, >, >= and == |
1914 | // |
1915 | // Arguments: |
1916 | // simdNode - The GT_SIMD node |
1917 | // |
1918 | // Return Value: |
1919 | // None. |
1920 | // |
1921 | void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) |
1922 | { |
1923 | GenTree* op1 = simdNode->gtGetOp1(); |
1924 | GenTree* op2 = simdNode->gtGetOp2(); |
1925 | var_types baseType = simdNode->gtSIMDBaseType; |
1926 | regNumber targetReg = simdNode->gtRegNum; |
1927 | var_types targetType = simdNode->TypeGet(); |
1928 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
1929 | |
1930 | genConsumeOperands(simdNode); |
1931 | regNumber op1Reg = op1->gtRegNum; |
1932 | regNumber op2Reg = op2->gtRegNum; |
1933 | regNumber otherReg = op2Reg; |
1934 | |
1935 | switch (simdNode->gtSIMDIntrinsicID) |
1936 | { |
1937 | case SIMDIntrinsicEqual: |
1938 | case SIMDIntrinsicGreaterThan: |
1939 | { |
1940 | assert(targetReg != REG_NA); |
1941 | |
1942 | #ifdef DEBUG |
            // On SSE2, vector<(u)long> relational ops should have been implemented in terms
            // of TYP_INT comparison operations, so they should only reach here on SSE4 or higher.
1945 | if (baseType == TYP_LONG || baseType == TYP_ULONG) |
1946 | { |
1947 | assert(level >= SIMD_SSE4_Supported); |
1948 | } |
1949 | #endif |
1950 | |
            // Greater-than: floating point vectors are lowered to "<" with swapped operands,
            // so we should not see a floating point base type here.
1952 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan) |
1953 | { |
1954 | assert(!varTypeIsFloating(baseType)); |
1955 | } |
1956 | |
1957 | unsigned ival = 0; |
1958 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
1959 | |
1960 | // targetReg = op1reg > op2reg |
1961 | // Therefore, we can optimize if op1Reg == targetReg |
1962 | otherReg = op2Reg; |
1963 | if (op1Reg != targetReg) |
1964 | { |
1965 | if (op2Reg == targetReg) |
1966 | { |
1967 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual); |
1968 | otherReg = op1Reg; |
1969 | } |
1970 | else |
1971 | { |
1972 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
1973 | } |
1974 | } |
1975 | |
1976 | if (varTypeIsFloating(baseType)) |
1977 | { |
1978 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival); |
1979 | } |
1980 | else |
1981 | { |
1982 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
1983 | } |
1984 | } |
1985 | break; |
1986 | |
1987 | case SIMDIntrinsicLessThan: |
1988 | case SIMDIntrinsicLessThanOrEqual: |
1989 | { |
1990 | assert(targetReg != REG_NA); |
1991 | |
            // Int vectors are lowered to ">" and ">=" with swapped operands,
            // so only floating point base types should reach here.
1993 | assert(varTypeIsFloating(baseType)); |
1994 | |
1995 | // Get the instruction opcode for compare operation |
1996 | unsigned ival; |
1997 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
1998 | |
1999 | // targetReg = op1reg RelOp op2reg |
            // Therefore, we can optimize if op1Reg == targetReg
2001 | if (op1Reg != targetReg) |
2002 | { |
2003 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2004 | } |
2005 | |
2006 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival); |
2007 | } |
2008 | break; |
2009 | |
2010 | // (In)Equality that produces bool result instead of a bit vector |
2011 | case SIMDIntrinsicOpEquality: |
2012 | case SIMDIntrinsicOpInEquality: |
2013 | { |
            // We're only setting condition flags; if a 0/1 value is desired, Lowering should have inserted a SETCC.
2015 | assert(targetReg == REG_NA); |
2016 | |
2017 | var_types simdType = op1->TypeGet(); |
2018 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
2019 | if (simdType == TYP_DOUBLE) |
2020 | { |
2021 | simdType = TYP_SIMD8; |
2022 | } |
2023 | |
2024 | // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16 |
2025 | // since both the operands will be in XMM registers. |
2026 | if (simdType == TYP_SIMD12) |
2027 | { |
2028 | simdType = TYP_SIMD16; |
2029 | } |
2030 | |
2031 | // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest. |
2032 | if (op2->isContained()) |
2033 | { |
2034 | assert((compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0)); |
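                // ptest sets ZF iff the bitwise AND of its operands is zero, so testing a
                // register against itself sets ZF exactly when the vector is all zeroes.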
2035 | inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType)); |
2036 | } |
2037 | else |
2038 | { |
2039 | // We need one additional SIMD register to store the result of the SIMD compare. |
2040 | regNumber tmpReg1 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
2041 | |
2042 | // tmpReg1 = (op1Reg == op2Reg) |
2043 | // Call this value of tmpReg1 as 'compResult' for further reference below. |
2044 | regNumber otherReg = op2Reg; |
2045 | if (tmpReg1 != op2Reg) |
2046 | { |
2047 | if (tmpReg1 != op1Reg) |
2048 | { |
2049 | inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType)); |
2050 | } |
2051 | } |
2052 | else |
2053 | { |
2054 | otherReg = op1Reg; |
2055 | } |
2056 | |
2057 | // For all integer types we can use TYP_INT comparison. |
2058 | unsigned ival = 0; |
2059 | instruction ins = |
2060 | getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival); |
2061 | |
2062 | if (varTypeIsFloating(baseType)) |
2063 | { |
2064 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival); |
2065 | } |
2066 | else |
2067 | { |
2068 | inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType)); |
2069 | } |
2070 | |
2071 | regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
2072 | inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType)); |
2073 | // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare |
2074 | // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a |
2075 | // subset of each component's ones/zeroes. In the end we need to know if the result is |
2076 | // "all ones" where the number of ones is given by the vector byte size, not by the |
2077 | // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and |
2078 | // for SSE registers we need to compare to 0x0000FFFF. |
2079 | // The SIMD12 case is handled specially, because we can't rely on the upper bytes being |
2080 | // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF). |
                // Note that -1 is used instead of 0xFFFFFFFF: on x64 the emitter doesn't
                // recognize that 0xFFFFFFFF can be encoded in a single byte and emits the
                // longer 3DFFFFFFFF encoding instead of 83F8FF.
2084 | ssize_t mask; |
2085 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
2086 | { |
2087 | mask = 0x00000FFF; |
2088 | getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask); |
2089 | } |
2090 | else if (emitActualTypeSize(simdType) == 32) |
2091 | { |
2092 | mask = -1; |
2093 | } |
2094 | else |
2095 | { |
2096 | mask = 0x0000FFFF; |
2097 | } |
2098 | getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask); |
2099 | } |
2100 | } |
2101 | break; |
2102 | |
2103 | default: |
            noway_assert(!"Unimplemented SIMD relational operation.");
2105 | unreached(); |
2106 | } |
2107 | |
2108 | genProduceReg(simdNode); |
2109 | } |
2110 | |
2111 | //-------------------------------------------------------------------------------- |
2112 | // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product. |
2113 | // |
2114 | // Arguments: |
2115 | // simdNode - The GT_SIMD node |
2116 | // |
2117 | // Return Value: |
2118 | // None. |
2119 | // |
2120 | void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode) |
2121 | { |
2122 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct); |
2123 | |
2124 | GenTree* op1 = simdNode->gtGetOp1(); |
2125 | GenTree* op2 = simdNode->gtGetOp2(); |
2126 | var_types baseType = simdNode->gtSIMDBaseType; |
2127 | var_types simdType = op1->TypeGet(); |
2128 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
2129 | if (simdType == TYP_DOUBLE) |
2130 | { |
2131 | simdType = TYP_SIMD8; |
2132 | } |
2133 | var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType; |
2134 | regNumber targetReg = simdNode->gtRegNum; |
2135 | assert(targetReg != REG_NA); |
2136 | |
2137 | var_types targetType = simdNode->TypeGet(); |
2138 | assert(targetType == baseType); |
2139 | |
2140 | genConsumeOperands(simdNode); |
2141 | regNumber op1Reg = op1->gtRegNum; |
2142 | regNumber op2Reg = op2->gtRegNum; |
2143 | regNumber tmpReg1 = REG_NA; |
2144 | regNumber tmpReg2 = REG_NA; |
2145 | |
2146 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
2147 | |
    // Dot product intrinsic is supported only on float/double vectors,
    // 16-byte int vectors on SSE4, and 32-byte int vectors on AVX.
2150 | // |
2151 | // Float/Double Vectors: |
2152 | // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register |
2153 | // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or |
2154 | // smaller on AVX, then we don't need a tmpReg. |
2155 | // |
2156 | // 32-byte integer vector on AVX: we need two additional Xmm registers |
2157 | // different from targetReg as scratch. |
2158 | // |
2159 | // 16-byte integer vector on SSE4: we need one additional Xmm register |
2160 | // different from targetReg as scratch. |
2161 | if (varTypeIsFloating(baseType)) |
2162 | { |
2163 | if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32)) |
2164 | { |
2165 | tmpReg1 = simdNode->GetSingleTempReg(); |
2166 | assert(tmpReg1 != targetReg); |
2167 | } |
2168 | else |
2169 | { |
2170 | assert(simdNode->AvailableTempRegCount() == 0); |
2171 | } |
2172 | } |
2173 | else |
2174 | { |
2175 | assert(baseType == TYP_INT); |
2176 | assert(level >= SIMD_SSE4_Supported); |
2177 | |
2178 | if (level == SIMD_SSE4_Supported) |
2179 | { |
2180 | tmpReg1 = simdNode->GetSingleTempReg(); |
2181 | } |
2182 | else |
2183 | { |
2184 | tmpReg1 = simdNode->ExtractTempReg(); |
2185 | tmpReg2 = simdNode->GetSingleTempReg(); |
2186 | } |
2187 | } |
2188 | |
2189 | if (level == SIMD_SSE2_Supported) |
2190 | { |
2191 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg |
2192 | if (op1Reg == targetReg) |
2193 | { |
2194 | // Best case |
2195 | // nothing to do, we have registers in the right place |
2196 | } |
2197 | else if (op2Reg == targetReg) |
2198 | { |
2199 | op2Reg = op1Reg; |
2200 | } |
2201 | else |
2202 | { |
2203 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
2204 | } |
2205 | |
2206 | // DotProduct(v1, v2) |
2207 | // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1 |
2208 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
2209 | { |
2210 | assert(baseType == TYP_FLOAT); |
2211 | // v0 = v1 * v2 |
2212 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
2213 | // // position |
2214 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper |
2215 | // // bits |
2216 | // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1) |
2217 | // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2) |
2218 | // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2) |
2219 | // |
2220 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
2221 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2222 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY); |
2223 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2224 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW); |
2225 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2226 | } |
2227 | else if (baseType == TYP_FLOAT) |
2228 | { |
2229 | // v0 = v1 * v2 |
2230 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
2231 | // // position |
2232 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1) |
2233 | // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1) |
2234 | // tmp = v0 |
2235 | // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2) |
2236 | // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3) |
2237 | // // Essentially horizontal addition of all elements. |
2238 | // // We could achieve the same using SSEv3 instruction |
2239 | // // HADDPS. |
2240 | // |
2241 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
2242 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2243 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY); |
2244 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2245 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2246 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW); |
2247 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
2248 | } |
2249 | else |
2250 | { |
2251 | assert(baseType == TYP_DOUBLE); |
2252 | |
2253 | // v0 = v1 * v2 |
2254 | // tmp = v0 // v0 = (1, 0) - each element is given by its position |
2255 | // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1) |
2256 | // v0 = v0 + tmp // v0 = (1+0, 0+1) |
2257 | inst_RV_RV(INS_mulpd, targetReg, op2Reg); |
2258 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
2259 | inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01); |
2260 | inst_RV_RV(INS_addpd, targetReg, tmpReg1); |
2261 | } |
2262 | } |
2263 | else |
2264 | { |
2265 | assert(level >= SIMD_SSE4_Supported); |
2266 | |
2267 | if (varTypeIsFloating(baseType)) |
2268 | { |
2269 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg. |
2270 | // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually |
2271 | // use the 3-op form, so that we can avoid these copies. |
2272 | // TODO-CQ: Add inst_RV_RV_RV_IV(). |
2273 | if (op1Reg == targetReg) |
2274 | { |
2275 | // Best case |
2276 | // nothing to do, we have registers in the right place |
2277 | } |
2278 | else if (op2Reg == targetReg) |
2279 | { |
2280 | op2Reg = op1Reg; |
2281 | } |
2282 | else |
2283 | { |
2284 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
2285 | } |
2286 | |
2287 | emitAttr emitSize = emitActualTypeSize(simdEvalType); |
2288 | if (baseType == TYP_FLOAT) |
2289 | { |
2290 | // dpps computes the dot product of the upper & lower halves of the 32-byte register. |
2291 | // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. |
2292 | unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1; |
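                // In the dpps immediate, the high nibble selects the elements that enter the
                // multiply-add (0x7 = lower three for Vector3, 0xf = all four) and the low
                // nibble (0x1) broadcasts the sum to element 0 only.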
2293 | inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask); |
2296 | // If this is TYP_SIMD32, we need to combine the lower & upper results. |
2297 | if (simdEvalType == TYP_SIMD32) |
2298 | { |
2299 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
2300 | inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
2301 | } |
2302 | } |
2303 | else if (baseType == TYP_DOUBLE) |
2304 | { |
2305 | if (simdEvalType == TYP_SIMD32) |
2306 | { |
2307 | // targetReg = targetReg * op2Reg |
2308 | // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves |
2309 | // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg |
2310 | // targetReg = targetReg + tmpReg1 |
2311 | inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType)); |
2312 | inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType)); |
2313 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
2314 | inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
2315 | } |
2316 | else |
2317 | { |
2318 | // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use |
2319 | // dppd directly. |
2320 | assert(level == SIMD_SSE4_Supported); |
2321 | inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31); |
2322 | } |
2323 | } |
2324 | } |
2325 | else |
2326 | { |
            // Dot product of int vectors: 16-byte on SSE4, 32-byte on AVX.
2328 | assert(baseType == TYP_INT); |
2329 | assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32); |
2330 | |
2331 | #ifdef DEBUG |
2332 | // SSE4: We need 1 scratch register. |
2333 | // AVX2: We need 2 scratch registers. |
2334 | if (simdEvalType == TYP_SIMD16) |
2335 | { |
2336 | assert(tmpReg1 != REG_NA); |
2337 | } |
2338 | else |
2339 | { |
2340 | assert(tmpReg1 != REG_NA); |
2341 | assert(tmpReg2 != REG_NA); |
2342 | } |
2343 | #endif |
2344 | |
2345 | // tmpReg1 = op1 * op2 |
2346 | if (level == SIMD_AVX2_Supported) |
2347 | { |
                    // On AVX, take advantage of the 3-operand form of pmulld
2349 | inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType)); |
2350 | } |
2351 | else |
2352 | { |
2353 | inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType); |
2354 | inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType); |
2355 | } |
2356 | |
2357 | if (simdEvalType == TYP_SIMD32) |
2358 | { |
2359 | // tmpReg2[127..0] = Upper 128-bits of tmpReg1 |
2360 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01); |
2361 | |
2362 | // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0] |
2363 | // This will compute |
2364 | // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4] |
2365 | // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5] |
2366 | // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6] |
2367 | // tmpReg1[4] = op1[4]*op2[4] + op1[7]*op2[7] |
2368 | inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE); |
2369 | } |
2370 | |
2371 | // This horizontal add will compute |
2372 | // |
2373 | // TYP_SIMD16: |
2374 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1] |
2375 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[4]*op2[4] |
2376 | // |
2377 | // TYP_SIMD32: |
2378 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5] |
2379 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[4]*op2[4] + op1[7]*op2[7] |
2380 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
2381 | |
2382 | // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1] |
2383 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
2384 | |
2385 | // TargetReg = integer result from tmpReg1 |
2386 | // (Note that for mov_xmm2i, the int register is always in the reg2 position) |
2387 | inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT); |
2388 | } |
2389 | } |
2390 | |
2391 | genProduceReg(simdNode); |
2392 | } |
2393 | |
2394 | //------------------------------------------------------------------------------------ |
2395 | // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. |
2396 | // |
2397 | // Arguments: |
2398 | // simdNode - The GT_SIMD node |
2399 | // |
2400 | // Return Value: |
2401 | // None. |
2402 | // |
2403 | void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) |
2404 | { |
2405 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem); |
2406 | |
2407 | GenTree* op1 = simdNode->gtGetOp1(); |
2408 | GenTree* op2 = simdNode->gtGetOp2(); |
2409 | var_types simdType = op1->TypeGet(); |
2410 | assert(varTypeIsSIMD(simdType)); |
2411 | |
2412 | // op1 of TYP_SIMD12 should be considered as TYP_SIMD16, |
2413 | // since it is in XMM register. |
2414 | if (simdType == TYP_SIMD12) |
2415 | { |
2416 | simdType = TYP_SIMD16; |
2417 | } |
2418 | |
2419 | var_types baseType = simdNode->gtSIMDBaseType; |
2420 | regNumber targetReg = simdNode->gtRegNum; |
2421 | assert(targetReg != REG_NA); |
2422 | var_types targetType = simdNode->TypeGet(); |
2423 | assert(targetType == genActualType(baseType)); |
2424 | |
2425 | // GetItem has 2 operands: |
2426 | // - the source of SIMD type (op1) |
2427 | // - the index of the value to be returned. |
2428 | genConsumeOperands(simdNode); |
2429 | regNumber srcReg = op1->gtRegNum; |
2430 | |
    // Optimize the case where op1 is in memory: access the i'th element directly from memory.
2432 | if (!op1->isUsedFromReg()) |
2433 | { |
2434 | assert(op1->isContained()); |
2435 | |
2436 | regNumber baseReg; |
2437 | regNumber indexReg; |
2438 | int offset = 0; |
2439 | |
2440 | if (op1->OperIsLocal()) |
2441 | { |
2442 | // There are three parts to the total offset here: |
2443 | // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}. |
2444 | bool isEBPbased; |
2445 | unsigned varNum = op1->gtLclVarCommon.gtLclNum; |
2446 | offset += compiler->lvaFrameAddress(varNum, &isEBPbased); |
2447 | if (op1->OperGet() == GT_LCL_FLD) |
2448 | { |
2449 | offset += op1->gtLclFld.gtLclOffs; |
2450 | } |
2451 | baseReg = (isEBPbased) ? REG_EBP : REG_ESP; |
2452 | } |
2453 | else |
2454 | { |
2455 | // Require GT_IND addr to be not contained. |
2456 | assert(op1->OperGet() == GT_IND); |
2457 | |
2458 | GenTree* addr = op1->AsIndir()->Addr(); |
2459 | assert(!addr->isContained()); |
2460 | baseReg = addr->gtRegNum; |
2461 | } |
2462 | |
2463 | if (op2->isContainedIntOrIImmed()) |
2464 | { |
2465 | indexReg = REG_NA; |
2466 | offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType); |
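            // For example, a constant index of 2 into a Vector<float> adds 2 * 4 = 8 bytes.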
2467 | } |
2468 | else |
2469 | { |
2470 | indexReg = op2->gtRegNum; |
2471 | assert(genIsValidIntReg(indexReg)); |
2472 | } |
2473 | |
2474 | // Now, load the desired element. |
2475 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
2476 | emitTypeSize(baseType), // Of the vector baseType |
2477 | targetReg, // To targetReg |
2478 | baseReg, // Base Reg |
2479 | indexReg, // Indexed |
2480 | genTypeSize(baseType), // by the size of the baseType |
2481 | offset); |
2482 | genProduceReg(simdNode); |
2483 | return; |
2484 | } |
2485 | |
2486 | // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant. |
    // For the non-constant case, we will use the SIMD temp location to store the vector,
    // and then load the desired element.
2489 | // The range check will already have been performed, so at this point we know we have an index |
2490 | // within the bounds of the vector. |
2491 | if (!op2->IsCnsIntOrI()) |
2492 | { |
2493 | unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum; |
2494 | noway_assert(simdInitTempVarNum != BAD_VAR_NUM); |
2495 | bool isEBPbased; |
2496 | unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased); |
2497 | regNumber indexReg = op2->gtRegNum; |
2498 | |
2499 | // Store the vector to the temp location. |
2500 | getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)), |
2501 | emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0); |
2502 | |
2503 | // Now, load the desired element. |
2504 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
2505 | emitTypeSize(baseType), // Of the vector baseType |
2506 | targetReg, // To targetReg |
2507 | (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based |
2508 | indexReg, // Indexed |
2509 | genTypeSize(baseType), // by the size of the baseType |
2510 | offs); |
2511 | genProduceReg(simdNode); |
2512 | return; |
2513 | } |
2514 | |
2515 | noway_assert(op2->isContained()); |
2516 | noway_assert(op2->IsCnsIntOrI()); |
2517 | unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal; |
2518 | unsigned int byteShiftCnt = index * genTypeSize(baseType); |
2519 | |
2520 | // In general we shouldn't have an index greater than or equal to the length of the vector. |
2521 | // However, if we have an out-of-range access, under minOpts it will not be optimized |
2522 | // away. The code will throw before we reach this point, but we still need to generate |
2523 | // code. In that case, we will simply mask off the upper bits. |
2524 | if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength()) |
2525 | { |
2526 | byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1); |
2527 | index = byteShiftCnt / genTypeSize(baseType); |
2528 | } |
2529 | |
2530 | regNumber tmpReg = REG_NA; |
2531 | if (simdNode->AvailableTempRegCount() != 0) |
2532 | { |
2533 | tmpReg = simdNode->GetSingleTempReg(); |
2534 | } |
2535 | else |
2536 | { |
2537 | assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) || |
2538 | (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16))); |
2539 | } |
2540 | |
2541 | if (byteShiftCnt >= 16) |
2542 | { |
2543 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
2544 | byteShiftCnt -= 16; |
2545 | regNumber newSrcReg; |
2546 | if (varTypeIsFloating(baseType)) |
2547 | { |
2548 | newSrcReg = targetReg; |
2549 | } |
2550 | else |
2551 | { |
2552 | // Integer types |
2553 | assert(tmpReg != REG_NA); |
2554 | newSrcReg = tmpReg; |
2555 | } |
2556 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01); |
2557 | |
2558 | srcReg = newSrcReg; |
2559 | } |
2560 | |
2561 | // Generate the following sequence: |
2562 | // 1) baseType is floating point |
2563 | // movaps targetReg, srcReg |
2564 | // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element |
2565 | // |
2566 | // 2) baseType is not floating point |
2567 | // movaps tmpReg, srcReg <-- not generated if accessing zero'th element |
2568 | // OR if tmpReg == srcReg |
2569 | // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element |
2570 | // mov_xmm2i targetReg, tmpReg |
2571 | if (varTypeIsFloating(baseType)) |
2572 | { |
2573 | if (targetReg != srcReg) |
2574 | { |
2575 | inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType)); |
2576 | } |
2577 | |
2578 | if (byteShiftCnt != 0) |
2579 | { |
2580 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
2581 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt); |
2582 | } |
2583 | } |
2584 | else |
2585 | { |
2586 | if (varTypeIsSmallInt(baseType)) |
2587 | { |
            // Note that pextrw extracts a 16-bit value by index and zero extends it to 32 bits.
            // For Vector<short> we also need to sign extend the 16-bit value in targetReg.
            // For Vector<byte>, index/2 gives the index of the 16-bit value to extract; shift
            // right by 8 bits if the index is odd. For Vector<sbyte>, also sign extend targetReg.
2592 | |
2593 | unsigned baseSize = genTypeSize(baseType); |
2594 | if (baseSize == 1) |
2595 | { |
2596 | index /= 2; |
2597 | } |
2598 | // We actually want index % 8 for the AVX case (for SSE it will never be > 8). |
2599 | // Note that this doesn't matter functionally, because the instruction uses just the |
2600 | // low 3 bits of index, but it's better to use the right value. |
2601 | if (index > 8) |
2602 | { |
2603 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
2604 | index -= 8; |
2605 | } |
2606 | |
2607 | getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index); |
2608 | |
2609 | bool ZeroOrSignExtnReqd = true; |
2610 | if (baseSize == 1) |
2611 | { |
2612 | if ((op2->gtIntCon.gtIconVal % 2) == 1) |
2613 | { |
                    // When extracting a byte-sized element at an odd index, right shift the
                    // extracted word by 8 bits.
2615 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8); |
2616 | |
2617 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE |
2618 | ZeroOrSignExtnReqd = (baseType == TYP_BYTE); |
2619 | } |
2620 | // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits |
2621 | } |
2622 | else |
2623 | { |
2624 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT |
2625 | assert(baseSize == 2); |
2626 | ZeroOrSignExtnReqd = (baseType == TYP_SHORT); |
2627 | } |
2628 | |
2629 | if (ZeroOrSignExtnReqd) |
2630 | { |
2631 | // Zero/sign extend the byte/short to 32-bits |
2632 | inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType)); |
2633 | } |
2634 | } |
2635 | else |
2636 | { |
            // We need a temp xmm register if the baseType is not floating point and
            // we are accessing a non-zero'th element.
2639 | instruction ins; |
2640 | |
2641 | if (byteShiftCnt != 0) |
2642 | { |
2643 | assert(tmpReg != REG_NA); |
2644 | |
2645 | if (tmpReg != srcReg) |
2646 | { |
2647 | inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType)); |
2648 | } |
2649 | |
2650 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
2651 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt); |
2652 | } |
2653 | else |
2654 | { |
2655 | tmpReg = srcReg; |
2656 | } |
2657 | |
2658 | assert(tmpReg != REG_NA); |
2659 | ins = ins_CopyFloatToInt(TYP_FLOAT, baseType); |
2660 | // (Note that for mov_xmm2i, the int register is always in the reg2 position.) |
2661 | inst_RV_RV(ins, tmpReg, targetReg, baseType); |
2662 | } |
2663 | } |
2664 | |
2665 | genProduceReg(simdNode); |
2666 | } |
2667 | |
2668 | //------------------------------------------------------------------------------------ |
2669 | // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i. |
2670 | // |
2671 | // Arguments: |
2672 | // simdNode - The GT_SIMD node |
2673 | // |
2674 | // Return Value: |
2675 | // None. |
2676 | // |
2677 | // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case. |
2678 | // |
2679 | void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode) |
2680 | { |
2681 | // Determine index based on intrinsic ID |
2682 | int index = -1; |
2683 | switch (simdNode->gtSIMDIntrinsicID) |
2684 | { |
2685 | case SIMDIntrinsicSetX: |
2686 | index = 0; |
2687 | break; |
2688 | case SIMDIntrinsicSetY: |
2689 | index = 1; |
2690 | break; |
2691 | case SIMDIntrinsicSetZ: |
2692 | index = 2; |
2693 | break; |
2694 | case SIMDIntrinsicSetW: |
2695 | index = 3; |
2696 | break; |
2697 | |
2698 | default: |
2699 | unreached(); |
2700 | } |
2701 | assert(index != -1); |
2702 | |
2703 | // op1 is the SIMD vector |
2704 | // op2 is the value to be set |
2705 | GenTree* op1 = simdNode->gtGetOp1(); |
2706 | GenTree* op2 = simdNode->gtGetOp2(); |
2707 | |
2708 | var_types baseType = simdNode->gtSIMDBaseType; |
2709 | regNumber targetReg = simdNode->gtRegNum; |
2710 | assert(targetReg != REG_NA); |
2711 | var_types targetType = simdNode->TypeGet(); |
2712 | assert(varTypeIsSIMD(targetType)); |
2713 | |
2714 | // the following assert must hold. |
2715 | // supported only on vector2f/3f/4f right now |
2716 | noway_assert(baseType == TYP_FLOAT); |
2717 | assert(op2->TypeGet() == baseType); |
2718 | assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType))); |
2719 | |
2720 | genConsumeOperands(simdNode); |
2721 | regNumber op1Reg = op1->gtRegNum; |
2722 | regNumber op2Reg = op2->gtRegNum; |
2723 | |
2724 | // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate. |
2725 | if (targetReg != op1Reg) |
2726 | { |
2727 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2728 | } |
2729 | |
2730 | // Right now this intrinsic is supported only for float base type vectors. |
    // If we need to support other base type vectors in the future, the logic below
    // will need modification.
2733 | noway_assert(baseType == TYP_FLOAT); |
2734 | |
2735 | if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) |
2736 | { |
2737 | // We need one additional int register as scratch |
2738 | regNumber tmpReg = simdNode->GetSingleTempReg(); |
2739 | assert(genIsValidIntReg(tmpReg)); |
2740 | |
2741 | // Move the value from xmm reg to an int reg |
2742 | instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT); |
        // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
2744 | inst_RV_RV(ins, op2Reg, tmpReg, baseType); |
2745 | |
2746 | // First insert the lower 16-bits of tmpReg in targetReg at 2*index position |
2747 | // since every float has two 16-bit words. |
2748 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index); |
2749 | |
2750 | // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position |
2751 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16); |
2752 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1); |
2753 | } |
2754 | else |
2755 | { |
2756 | unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index)); |
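        // For example, SetZ (index == 2) yields insertpsImm == (0 << 6) | (2 << 4) == 0x20:
        // insert element 0 of op2Reg at element 2 of targetReg, zeroing nothing.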
2757 | inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm); |
2758 | } |
2759 | |
2760 | genProduceReg(simdNode); |
2761 | } |
2762 | |
2763 | //------------------------------------------------------------------------ |
2764 | // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle. |
2765 | // |
2766 | // Arguments: |
2767 | // simdNode - The GT_SIMD node |
2768 | // |
2769 | // Return Value: |
2770 | // None. |
2771 | // |
2772 | void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode) |
2773 | { |
2774 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2); |
2775 | noway_assert(compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported); |
2776 | |
2777 | GenTree* op1 = simdNode->gtGetOp1(); |
2778 | GenTree* op2 = simdNode->gtGetOp2(); |
2779 | assert(op2->isContained()); |
2780 | assert(op2->IsCnsIntOrI()); |
2781 | int shuffleControl = (int)op2->AsIntConCommon()->IconValue(); |
2782 | var_types baseType = simdNode->gtSIMDBaseType; |
2783 | var_types targetType = simdNode->TypeGet(); |
2784 | regNumber targetReg = simdNode->gtRegNum; |
2785 | assert(targetReg != REG_NA); |
2786 | |
2787 | regNumber op1Reg = genConsumeReg(op1); |
2788 | if (targetReg != op1Reg) |
2789 | { |
2790 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
2791 | } |
2792 | |
2793 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
2794 | getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl); |
2795 | genProduceReg(simdNode); |
2796 | } |
2797 | |
2798 | //----------------------------------------------------------------------------- |
2799 | // genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory. |
2800 | // Since Vector3 is not a hardware supported write size, it is performed |
2801 | // as two writes: 8 byte followed by 4-byte. |
2802 | // |
2803 | // Arguments: |
2804 | // treeNode - tree node that is attempting to store indirect |
2805 | // |
2806 | // |
2807 | // Return Value: |
2808 | // None. |
2809 | // |
2810 | void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode) |
2811 | { |
2812 | assert(treeNode->OperGet() == GT_STOREIND); |
2813 | |
2814 | GenTree* addr = treeNode->gtOp.gtOp1; |
2815 | GenTree* data = treeNode->gtOp.gtOp2; |
2816 | |
2817 | // addr and data should not be contained. |
2818 | assert(!data->isContained()); |
2819 | assert(!addr->isContained()); |
2820 | |
2821 | #ifdef DEBUG |
2822 | // Should not require a write barrier |
2823 | GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data); |
2824 | assert(writeBarrierForm == GCInfo::WBF_NoBarrier); |
2825 | #endif |
2826 | |
    // Need an additional Xmm register to extract upper 4 bytes from data.
2828 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2829 | |
2830 | genConsumeOperands(treeNode->AsOp()); |
2831 | |
2832 | // 8-byte write |
2833 | getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0); |
2834 | |
2835 | // Extract upper 4-bytes from data |
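    // (pshufd with immediate 0x02 selects source element 2 into element 0 of tmpReg;
    // the remaining elements are don't-care for this 4-byte store.)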
2836 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02); |
2837 | |
2838 | // 4-byte write |
2839 | getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8); |
2840 | } |
2841 | |
2842 | //----------------------------------------------------------------------------- |
2843 | // genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value. |
2844 | // Since Vector3 is not a hardware supported write size, it is performed |
2845 | // as two loads: 8 byte followed by 4-byte. |
2846 | // |
2847 | // Arguments: |
2848 | // treeNode - tree node of GT_IND |
2849 | // |
2850 | // |
2851 | // Return Value: |
2852 | // None. |
2853 | // |
2854 | void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode) |
2855 | { |
2856 | assert(treeNode->OperGet() == GT_IND); |
2857 | |
2858 | regNumber targetReg = treeNode->gtRegNum; |
2859 | GenTree* op1 = treeNode->gtOp.gtOp1; |
2860 | assert(!op1->isContained()); |
2861 | regNumber operandReg = genConsumeReg(op1); |
2862 | |
    // Need an additional Xmm register, different from targetReg, to read the upper 4 bytes.
2864 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2865 | assert(tmpReg != targetReg); |
2866 | |
2867 | // Load upper 4 bytes in tmpReg |
2868 | getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8); |
2869 | |
2870 | // Load lower 8 bytes in targetReg |
2871 | getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0); |
2872 | |
2873 | // combine upper 4 bytes and lower 8 bytes in targetReg |
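    // (Per the SHUFFLE_* naming convention, SHUFFLE_YXYX keeps x and y in elements 0-1 of
    // targetReg and fills elements 2-3 from elements 0-1 of tmpReg: z, plus the zero that
    // the 4-byte load left in element 1.)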
2874 | getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX); |
2875 | |
2876 | genProduceReg(treeNode); |
2877 | } |
2878 | |
2879 | //----------------------------------------------------------------------------- |
2880 | // genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field. |
2881 | // Since Vector3 is not a hardware supported write size, it is performed |
2882 | // as two stores: 8 byte followed by 4-byte. |
2883 | // |
2884 | // Arguments: |
2885 | // treeNode - tree node that is attempting to store TYP_SIMD12 field |
2886 | // |
2887 | // Return Value: |
2888 | // None. |
2889 | // |
2890 | void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode) |
2891 | { |
2892 | assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR)); |
2893 | |
2894 | unsigned offs = 0; |
2895 | unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; |
2896 | assert(varNum < compiler->lvaCount); |
2897 | |
    if (treeNode->OperGet() == GT_STORE_LCL_FLD)
2899 | { |
2900 | offs = treeNode->gtLclFld.gtLclOffs; |
2901 | } |
2902 | |
2903 | GenTree* op1 = treeNode->gtOp.gtOp1; |
2904 | assert(!op1->isContained()); |
2905 | regNumber operandReg = genConsumeReg(op1); |
2906 | |
    // Need an additional Xmm register to extract the upper 4 bytes from the data.
2908 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2909 | |
2910 | // store lower 8 bytes |
2911 | getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs); |
2912 | |
    // Extract the upper 4 bytes from operandReg
2914 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02); |
2915 | |
2916 | // Store upper 4 bytes |
2917 | getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs + 8); |
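
    // The local's stack slot now holds x and y at [offs + 0..7] and z at [offs + 8..11];
    // the fourth element of the in-register value is not written.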
2918 | } |
2919 | |
2920 | //----------------------------------------------------------------------------- |
2921 | // genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field. |
2922 | // Since Vector3 is not a hardware supported read size, it is performed |
// as two reads: a 4-byte read followed by an 8-byte read.
2924 | // |
2925 | // Arguments: |
2926 | // treeNode - tree node that is attempting to load TYP_SIMD12 field |
2927 | // |
2928 | // Return Value: |
2929 | // None. |
2930 | // |
2931 | void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode) |
2932 | { |
2933 | assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR)); |
2934 | |
2935 | regNumber targetReg = treeNode->gtRegNum; |
2936 | unsigned offs = 0; |
2937 | unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; |
2938 | assert(varNum < compiler->lvaCount); |
2939 | |
2940 | if (treeNode->OperGet() == GT_LCL_FLD) |
2941 | { |
2942 | offs = treeNode->gtLclFld.gtLclOffs; |
2943 | } |
2944 | |
2945 | // Need an additional Xmm register that is different from targetReg to read upper 4 bytes. |
2946 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
2947 | assert(tmpReg != targetReg); |
2948 | |
2949 | // Read upper 4 bytes to tmpReg |
2950 | getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs + 8); |
2951 | |
2952 | // Read lower 8 bytes to targetReg |
2953 | getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs); |
2954 | |
2955 | // combine upper 4 bytes and lower 8 bytes in targetReg |
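    // As in genLoadIndTypeSIMD12: SHUFFLE_YXYX keeps x and y in the low two elements
    // of targetReg, moves tmpReg's element 0 (z) into element 2, and leaves zero in
    // element 3 (the 4-byte load above zeroed the upper lanes of tmpReg).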
2956 | getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX); |
2957 | |
2958 | genProduceReg(treeNode); |
2959 | } |
2960 | |
2961 | #ifdef _TARGET_X86_ |
2962 | |
2963 | //----------------------------------------------------------------------------- |
// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) value to the stack.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: an 8-byte store followed by a 4-byte store. The stack is
// assumed to have already been adjusted.
2968 | // |
2969 | // Arguments: |
2970 | // operandReg - the xmm register containing the SIMD12 to store. |
2971 | // tmpReg - an xmm register that can be used as a temporary for the operation. |
2972 | // |
2973 | // Return Value: |
2974 | // None. |
2975 | // |
2976 | void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg) |
2977 | { |
2978 | assert(genIsValidFloatReg(operandReg)); |
2979 | assert(genIsValidFloatReg(tmpReg)); |
2980 | |
2981 | // 8-byte write |
2982 | getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0); |
2983 | |
    // Extract the upper 4 bytes from operandReg
2985 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02); |
2986 | |
2987 | // 4-byte write |
2988 | getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8); |
2989 | } |
2990 | |
2991 | //----------------------------------------------------------------------------- |
// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) argument to the stack.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: an 8-byte store followed by a 4-byte store. The stack is
// assumed to have already been adjusted.
2996 | // |
2997 | // Arguments: |
2998 | // treeNode - tree node that is attempting to store TYP_SIMD12 field |
2999 | // |
3000 | // Return Value: |
3001 | // None. |
3002 | // |
3003 | void CodeGen::genPutArgStkSIMD12(GenTree* treeNode) |
3004 | { |
3005 | assert(treeNode->OperGet() == GT_PUTARG_STK); |
3006 | |
3007 | GenTree* op1 = treeNode->gtOp.gtOp1; |
3008 | assert(!op1->isContained()); |
3009 | regNumber operandReg = genConsumeReg(op1); |
3010 | |
    // Need an additional Xmm register to extract the upper 4 bytes from the data.
3012 | regNumber tmpReg = treeNode->GetSingleTempReg(); |
3013 | |
3014 | genStoreSIMD12ToStack(operandReg, tmpReg); |
3015 | } |
3016 | |
3017 | #endif // _TARGET_X86_ |
3018 | |
3019 | //----------------------------------------------------------------------------- |
3020 | // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to |
3021 | // the given register, if any, or to memory. |
3022 | // |
3023 | // Arguments: |
3024 | // simdNode - The GT_SIMD node |
3025 | // |
3026 | // Return Value: |
3027 | // None. |
3028 | // |
3029 | // Notes: |
3030 | // The upper half of all AVX registers is volatile, even the callee-save registers. |
3031 | // When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic |
3032 | // to cause the upper half to be saved. It will first attempt to find another, unused, callee-save |
3033 | // register. If such a register cannot be found, it will save it to an available caller-save register. |
3034 | // In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte |
3035 | // value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte |
3036 | // value will be spilled to the stack.) |
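//
// For example, if a 32-byte value is live in ymm7 across a call and xmm8 happens to be
// free (register numbers here are illustrative), the save and its matching restore
// reduce to:
//   vextractf128 xmm8, ymm7, 1
//   ...call...
//   vinsertf128  ymm7, ymm7, xmm8, 1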
3037 | // |
3038 | void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode) |
3039 | { |
3040 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave); |
3041 | |
3042 | GenTree* op1 = simdNode->gtGetOp1(); |
3043 | assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); |
3044 | regNumber targetReg = simdNode->gtRegNum; |
3045 | regNumber op1Reg = genConsumeReg(op1); |
3046 | assert(op1Reg != REG_NA); |
3047 | assert(targetReg != REG_NA); |
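    // vextractf128 with an imm8 of 1 copies the upper 128 bits of op1Reg (a ymm register)
    // into targetReg. Only the upper half needs saving: the lower 128 bits of a
    // callee-save register survive the call, per the Notes above.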
3048 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01); |
3049 | |
3050 | genProduceReg(simdNode); |
3051 | } |
3052 | |
3053 | //----------------------------------------------------------------------------- |
// genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector from
// the register in which it was saved, or from memory if it was subsequently spilled.
3056 | // |
3057 | // Arguments: |
3058 | // simdNode - The GT_SIMD node |
3059 | // |
3060 | // Return Value: |
3061 | // None. |
3062 | // |
3063 | // Notes: |
3064 | // For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always |
3065 | // have their home register, this node has its targetReg on the lclVar child, and its source |
3066 | // on the simdNode. |
3067 | // Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled |
3068 | // an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike |
3069 | // most spill scenarios, the saved tree will be different from the restored tree, but the spill |
3070 | // restore logic, which is triggered by the call to genConsumeReg, requires us to provide the |
3071 | // spilled tree (saveNode) in order to perform the reload. We can easily find that tree, |
3072 | // as it is in the spill descriptor for the register from which it was saved. |
3073 | // |
3074 | void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode) |
3075 | { |
3076 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore); |
3077 | |
3078 | GenTree* op1 = simdNode->gtGetOp1(); |
3079 | assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); |
3080 | regNumber srcReg = simdNode->gtRegNum; |
3081 | regNumber lclVarReg = genConsumeReg(op1); |
3082 | unsigned varNum = op1->AsLclVarCommon()->gtLclNum; |
3083 | assert(lclVarReg != REG_NA); |
3084 | assert(srcReg != REG_NA); |
3085 | if (simdNode->gtFlags & GTF_SPILLED) |
3086 | { |
3087 | GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree; |
3088 | noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg)); |
3089 | genConsumeReg(saveNode); |
3090 | } |
3091 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01); |
3092 | } |
3093 | |
3094 | //------------------------------------------------------------------------ |
3095 | // genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main |
3096 | // routine which in turn calls appropriate genSIMDIntrinsicXXX() routine. |
3097 | // |
3098 | // Arguments: |
3099 | // simdNode - The GT_SIMD node |
3100 | // |
3101 | // Return Value: |
3102 | // None. |
3103 | // |
3104 | // Notes: |
3105 | // Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and |
3106 | // a limited set of methods. |
3107 | // |
3108 | void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) |
3109 | { |
3110 | // NYI for unsupported base types |
3111 | if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG && |
3112 | simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE && |
3113 | simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE && |
3114 | simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE && |
3115 | simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG) |
3116 | { |
3117 | noway_assert(!"SIMD intrinsic with unsupported base type." ); |
3118 | } |
3119 | |
3120 | switch (simdNode->gtSIMDIntrinsicID) |
3121 | { |
3122 | case SIMDIntrinsicInit: |
3123 | genSIMDIntrinsicInit(simdNode); |
3124 | break; |
3125 | |
3126 | case SIMDIntrinsicInitN: |
3127 | genSIMDIntrinsicInitN(simdNode); |
3128 | break; |
3129 | |
3130 | case SIMDIntrinsicSqrt: |
3131 | case SIMDIntrinsicCast: |
3132 | case SIMDIntrinsicAbs: |
3133 | genSIMDIntrinsicUnOp(simdNode); |
3134 | break; |
3135 | |
3136 | case SIMDIntrinsicConvertToSingle: |
3137 | case SIMDIntrinsicConvertToInt32: |
3138 | genSIMDIntrinsic32BitConvert(simdNode); |
3139 | break; |
3140 | |
3141 | case SIMDIntrinsicConvertToDouble: |
3142 | case SIMDIntrinsicConvertToInt64: |
3143 | genSIMDIntrinsic64BitConvert(simdNode); |
3144 | break; |
3145 | |
3146 | case SIMDIntrinsicWidenLo: |
3147 | case SIMDIntrinsicWidenHi: |
3148 | genSIMDIntrinsicWiden(simdNode); |
3149 | break; |
3150 | |
3151 | case SIMDIntrinsicNarrow: |
3152 | genSIMDIntrinsicNarrow(simdNode); |
3153 | break; |
3154 | |
3155 | case SIMDIntrinsicAdd: |
3156 | case SIMDIntrinsicSub: |
3157 | case SIMDIntrinsicMul: |
3158 | case SIMDIntrinsicDiv: |
3159 | case SIMDIntrinsicBitwiseAnd: |
3160 | case SIMDIntrinsicBitwiseAndNot: |
3161 | case SIMDIntrinsicBitwiseOr: |
3162 | case SIMDIntrinsicBitwiseXor: |
3163 | case SIMDIntrinsicMin: |
3164 | case SIMDIntrinsicMax: |
3165 | genSIMDIntrinsicBinOp(simdNode); |
3166 | break; |
3167 | |
3168 | case SIMDIntrinsicOpEquality: |
3169 | case SIMDIntrinsicOpInEquality: |
3170 | case SIMDIntrinsicEqual: |
3171 | case SIMDIntrinsicLessThan: |
3172 | case SIMDIntrinsicGreaterThan: |
3173 | case SIMDIntrinsicLessThanOrEqual: |
3174 | case SIMDIntrinsicGreaterThanOrEqual: |
3175 | genSIMDIntrinsicRelOp(simdNode); |
3176 | break; |
3177 | |
3178 | case SIMDIntrinsicDotProduct: |
3179 | genSIMDIntrinsicDotProduct(simdNode); |
3180 | break; |
3181 | |
3182 | case SIMDIntrinsicGetItem: |
3183 | genSIMDIntrinsicGetItem(simdNode); |
3184 | break; |
3185 | |
3186 | case SIMDIntrinsicShuffleSSE2: |
3187 | genSIMDIntrinsicShuffleSSE2(simdNode); |
3188 | break; |
3189 | |
3190 | case SIMDIntrinsicSetX: |
3191 | case SIMDIntrinsicSetY: |
3192 | case SIMDIntrinsicSetZ: |
3193 | case SIMDIntrinsicSetW: |
3194 | genSIMDIntrinsicSetItem(simdNode); |
3195 | break; |
3196 | |
3197 | case SIMDIntrinsicUpperSave: |
3198 | genSIMDIntrinsicUpperSave(simdNode); |
3199 | break; |
3200 | case SIMDIntrinsicUpperRestore: |
3201 | genSIMDIntrinsicUpperRestore(simdNode); |
3202 | break; |
3203 | |
3204 | default: |
3205 | noway_assert(!"Unimplemented SIMD intrinsic." ); |
3206 | unreached(); |
3207 | } |
3208 | } |
3209 | |
3210 | #endif // FEATURE_SIMD |
3211 | #endif //_TARGET_XARCH_ |
3212 | |