// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                        Amd64 SIMD Code Generator                          XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifdef _TARGET_XARCH_
#ifdef FEATURE_SIMD

#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

// Instruction immediates

// Insertps:
// - bits 6 and 7 of the immediate indicate which source item to select (0..3)
// - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
// - bits 0 to 3 of the immediate indicate which target item to zero
#define INSERTPS_SOURCE_SELECT(i) (i << 6)
#define INSERTPS_TARGET_SELECT(i) (i << 4)
#define INSERTPS_ZERO(i) (1 << i)
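// For example (illustrative), an immediate that copies source element 2 into target
// element 0 and zeroes target element 3 would be composed as:
//     INSERTPS_SOURCE_SELECT(2) | INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(3) // == 0x88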

// getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
//
// Arguments:
//    intrinsicId  -  SIMD intrinsic Id
//    baseType     -  Base type of the SIMD vector
//    ival         -  Out param. Any immediate byte operand that needs to be passed to the SSE2 opcode
//
// Return Value:
//    Instruction (op) to be used, and ival is set if the instruction requires an immediate operand.
//
instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
{
    // Minimal required instruction set is SSE2.
    assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);

    instruction result = INS_invalid;
    switch (intrinsicId)
    {
        case SIMDIntrinsicInit:
            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
            {
                // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
                // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
                // If we decide to use AVX2 only, we can remove this assert.
                if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2))
                {
                    assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
                }
                switch (baseType)
                {
                    case TYP_FLOAT:
                        result = INS_vbroadcastss;
                        break;
                    case TYP_DOUBLE:
                        result = INS_vbroadcastsd;
                        break;
                    case TYP_ULONG:
                    case TYP_LONG:
                        // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed
                        // to be TYP_LONG reg.
                        result = INS_vpbroadcastq;
                        break;
                    case TYP_UINT:
                    case TYP_INT:
                        result = INS_vpbroadcastd;
                        break;
                    case TYP_USHORT:
                    case TYP_SHORT:
                        result = INS_vpbroadcastw;
                        break;
                    case TYP_UBYTE:
                    case TYP_BYTE:
                        result = INS_vpbroadcastb;
                        break;
                    default:
                        unreached();
                }
                break;
            }

            // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
            __fallthrough;

        case SIMDIntrinsicShuffleSSE2:
            if (baseType == TYP_FLOAT)
            {
                result = INS_shufps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_shufpd;
            }
            else if (baseType == TYP_INT || baseType == TYP_UINT)
            {
                result = INS_pshufd;
            }
            else if (baseType == TYP_LONG || baseType == TYP_ULONG)
            {
                // We don't have a separate SSE2 instruction and will
                // use the instruction meant for doubles since it is
                // of the same size as a long.
                result = INS_shufpd;
            }
            break;

        case SIMDIntrinsicSqrt:
            if (baseType == TYP_FLOAT)
            {
                result = INS_sqrtps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_sqrtpd;
            }
            else
            {
                unreached();
            }
            break;

        case SIMDIntrinsicAdd:
            if (baseType == TYP_FLOAT)
            {
                result = INS_addps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_addpd;
            }
            else if (baseType == TYP_INT || baseType == TYP_UINT)
            {
                result = INS_paddd;
            }
            else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
            {
                result = INS_paddw;
            }
            else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
            {
                result = INS_paddb;
            }
            else if (baseType == TYP_LONG || baseType == TYP_ULONG)
            {
                result = INS_paddq;
            }
            break;

        case SIMDIntrinsicSub:
            if (baseType == TYP_FLOAT)
            {
                result = INS_subps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_subpd;
            }
            else if (baseType == TYP_INT || baseType == TYP_UINT)
            {
                result = INS_psubd;
            }
            else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
            {
                result = INS_psubw;
            }
            else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
            {
                result = INS_psubb;
            }
            else if (baseType == TYP_LONG || baseType == TYP_ULONG)
            {
                result = INS_psubq;
            }
            break;

        case SIMDIntrinsicMul:
            if (baseType == TYP_FLOAT)
            {
                result = INS_mulps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_mulpd;
            }
            else if (baseType == TYP_SHORT)
            {
                result = INS_pmullw;
            }
            else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
            {
                result = INS_pmulld;
            }
            break;

        case SIMDIntrinsicDiv:
            if (baseType == TYP_FLOAT)
            {
                result = INS_divps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_divpd;
            }
            else
            {
                unreached();
            }
            break;

        case SIMDIntrinsicMin:
            if (baseType == TYP_FLOAT)
            {
                result = INS_minps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_minpd;
            }
            else if (baseType == TYP_UBYTE)
            {
                result = INS_pminub;
            }
            else if (baseType == TYP_SHORT)
            {
                result = INS_pminsw;
            }
            else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
            {
                if (baseType == TYP_BYTE)
                {
                    result = INS_pminsb;
                }
                else if (baseType == TYP_USHORT)
                {
                    result = INS_pminuw;
                }
                else if (baseType == TYP_INT)
                {
                    result = INS_pminsd;
                }
                else if (baseType == TYP_UINT)
                {
                    result = INS_pminud;
                }
            }
            else
            {
                unreached();
            }
            break;

        case SIMDIntrinsicMax:
            if (baseType == TYP_FLOAT)
            {
                result = INS_maxps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_maxpd;
            }
            else if (baseType == TYP_UBYTE)
            {
                result = INS_pmaxub;
            }
            else if (baseType == TYP_SHORT)
            {
                result = INS_pmaxsw;
            }
            else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
            {
                if (baseType == TYP_BYTE)
                {
                    result = INS_pmaxsb;
                }
                else if (baseType == TYP_USHORT)
                {
                    result = INS_pmaxuw;
                }
                else if (baseType == TYP_INT)
                {
                    result = INS_pmaxsd;
                }
                else if (baseType == TYP_UINT)
                {
                    result = INS_pmaxud;
                }
            }
            else
            {
                unreached();
            }
            break;

        case SIMDIntrinsicAbs:
            if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
            {
                if (baseType == TYP_INT)
                {
                    result = INS_pabsd;
                }
                else if (baseType == TYP_SHORT)
                {
                    result = INS_pabsw;
                }
                else if (baseType == TYP_BYTE)
                {
                    result = INS_pabsb;
                }
            }
            break;

        case SIMDIntrinsicEqual:
            if (baseType == TYP_FLOAT)
            {
                result = INS_cmpps;
                assert(ival != nullptr);
                *ival = 0;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_cmppd;
                assert(ival != nullptr);
                *ival = 0;
            }
            else if (baseType == TYP_INT || baseType == TYP_UINT)
            {
                result = INS_pcmpeqd;
            }
            else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
            {
                result = INS_pcmpeqw;
            }
            else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
            {
                result = INS_pcmpeqb;
            }
            else if ((baseType == TYP_ULONG || baseType == TYP_LONG) &&
                     (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
            {
                result = INS_pcmpeqq;
            }
            break;

        case SIMDIntrinsicLessThan:
            // Packed integers use > with swapped operands
            assert(baseType != TYP_INT);

            if (baseType == TYP_FLOAT)
            {
                result = INS_cmpps;
                assert(ival != nullptr);
                *ival = 1;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_cmppd;
                assert(ival != nullptr);
                *ival = 1;
            }
            break;

        case SIMDIntrinsicLessThanOrEqual:
            // Packed integers use (a==b) || (b > a) in place of a <= b.
            assert(baseType != TYP_INT);

            if (baseType == TYP_FLOAT)
            {
                result = INS_cmpps;
                assert(ival != nullptr);
                *ival = 2;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_cmppd;
                assert(ival != nullptr);
                *ival = 2;
            }
            break;

        case SIMDIntrinsicGreaterThan:
            // Packed float/double use < with swapped operands
            assert(!varTypeIsFloating(baseType));

            // SSE2 supports only signed >
            if (baseType == TYP_INT)
            {
                result = INS_pcmpgtd;
            }
            else if (baseType == TYP_SHORT)
            {
                result = INS_pcmpgtw;
            }
            else if (baseType == TYP_BYTE)
            {
                result = INS_pcmpgtb;
            }
            else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
            {
                result = INS_pcmpgtq;
            }
            break;

        case SIMDIntrinsicBitwiseAnd:
            if (baseType == TYP_FLOAT)
            {
                result = INS_andps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_andpd;
            }
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_pand;
            }
            break;

        case SIMDIntrinsicBitwiseAndNot:
            if (baseType == TYP_FLOAT)
            {
                result = INS_andnps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_andnpd;
            }
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_pandn;
            }
            break;

        case SIMDIntrinsicBitwiseOr:
            if (baseType == TYP_FLOAT)
            {
                result = INS_orps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_orpd;
            }
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_por;
            }
            break;

        case SIMDIntrinsicBitwiseXor:
            if (baseType == TYP_FLOAT)
            {
                result = INS_xorps;
            }
            else if (baseType == TYP_DOUBLE)
            {
                result = INS_xorpd;
            }
            else if (varTypeIsIntegral(baseType))
            {
                result = INS_pxor;
            }
            break;

        case SIMDIntrinsicCast:
            result = INS_movaps;
            break;

        case SIMDIntrinsicConvertToSingle:
            result = INS_cvtdq2ps;
            break;

        case SIMDIntrinsicConvertToDouble:
            assert(baseType == TYP_LONG);
            result = INS_cvtsi2sd;
            break;

        case SIMDIntrinsicConvertToInt32:
            assert(baseType == TYP_FLOAT);
            result = INS_cvttps2dq;
            break;

        case SIMDIntrinsicConvertToInt64:
            assert(baseType == TYP_DOUBLE);
            result = INS_cvttsd2si;
            break;

        case SIMDIntrinsicNarrow:
            // Note that for the integer types the caller must zero the upper bits of
            // each source element, since the instructions saturate.
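            // For example (illustrative), a TYP_INT element holding 0x00012345 would saturate
            // to 0x7FFF under packssdw; with the upper 16 bits already zeroed by the caller,
            // the value 0x2345 narrows unchanged.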
            switch (baseType)
            {
                case TYP_INT:
                case TYP_UINT:
                    if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
                    {
                        result = INS_packusdw;
                    }
                    else
                    {
                        result = INS_packssdw;
                    }
                    break;
                case TYP_SHORT:
                case TYP_USHORT:
                    result = INS_packuswb;
                    break;
                default:
                    assert(!"Invalid baseType for SIMDIntrinsicNarrow");
                    result = INS_invalid;
                    break;
            }
            break;

        case SIMDIntrinsicWidenLo:
            // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
            // and another to widen the hi half.
            switch (baseType)
            {
                case TYP_FLOAT:
                    result = INS_cvtps2pd;
                    break;
                case TYP_INT:
                case TYP_UINT:
                    result = INS_punpckldq;
                    break;
                case TYP_SHORT:
                case TYP_USHORT:
                    result = INS_punpcklwd;
                    break;
                case TYP_BYTE:
                case TYP_UBYTE:
                    result = INS_punpcklbw;
                    break;
                default:
                    assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
                    result = INS_invalid;
                    break;
            }
            break;

        case SIMDIntrinsicWidenHi:
            switch (baseType)
            {
                case TYP_FLOAT:
                    // For this case, we actually use the same instruction.
                    result = INS_cvtps2pd;
                    break;
                case TYP_INT:
                case TYP_UINT:
                    result = INS_punpckhdq;
                    break;
                case TYP_SHORT:
                case TYP_USHORT:
                    result = INS_punpckhwd;
                    break;
                case TYP_BYTE:
                case TYP_UBYTE:
                    result = INS_punpckhbw;
                    break;
                default:
                    assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
                    result = INS_invalid;
                    break;
            }
            break;

        case SIMDIntrinsicShiftLeftInternal:
            switch (baseType)
            {
                case TYP_SIMD16:
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
                    result = INS_pslldq;
                    break;
                case TYP_UINT:
                case TYP_INT:
                    result = INS_pslld;
                    break;
                case TYP_SHORT:
                case TYP_USHORT:
                    result = INS_psllw;
                    break;
                default:
                    assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
                    result = INS_invalid;
                    break;
            }
            break;

        case SIMDIntrinsicShiftRightInternal:
            switch (baseType)
            {
                case TYP_SIMD16:
                    // For SSE2, the entire vector is shifted; for AVX2, each 16-byte chunk is shifted.
                    result = INS_psrldq;
                    break;
                case TYP_UINT:
                case TYP_INT:
                    result = INS_psrld;
                    break;
                case TYP_SHORT:
                case TYP_USHORT:
                    result = INS_psrlw;
                    break;
                default:
                    assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
                    result = INS_invalid;
                    break;
            }
            break;

        case SIMDIntrinsicUpperSave:
            result = INS_vextractf128;
            break;

        case SIMDIntrinsicUpperRestore:
            result = INS_vinsertf128;
            break;

        default:
            assert(!"Unsupported SIMD intrinsic");
            unreached();
    }

    noway_assert(result != INS_invalid);
    return result;
}
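
// Example use (illustrative): getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_FLOAT, &ival)
// returns INS_cmpps and sets ival to 0, the "equal" comparison predicate for cmpps.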

// genSIMDScalarMove: Generate code to move a value of the given "baseType" from src mm reg
// to target mm reg, zeroing out the upper bits if and only if specified.
//
// Arguments:
//    targetType       the target type
//    baseType         the base type of the value to be moved
//    targetReg        the target reg
//    srcReg           the src reg
//    moveType         action to be performed on the target upper bits
//
// Return Value:
//    None
//
// Notes:
//    This is currently only supported for floating point types.
//
void CodeGen::genSIMDScalarMove(
    var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
{
    assert(varTypeIsFloating(baseType));
    switch (moveType)
    {
        case SMT_PreserveUpper:
            if (srcReg != targetReg)
            {
                instruction ins = ins_Store(baseType);
                if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
                {
                    // In general, when we use a three-operand move instruction, we want to merge the src with
                    // itself. This is an exception in that we actually want the "merge" behavior, so we must
                    // specify it with all 3 operands.
                    inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
                }
                else
                {
                    inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
                }
            }
            break;

        case SMT_ZeroInitUpper:
            if (compiler->canUseVexEncoding())
            {
                // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
                // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
                // to zero all but the lower bits.
                unsigned int insertpsImm =
                    (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
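                // With these macros, insertpsImm is 0x0E: source and target element 0,
                // target elements 1 through 3 zeroed.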
                inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
            }
            else
            {
                if (srcReg == targetReg)
                {
                    // There is no guarantee that the upper bits of srcReg are zero.
                    // We zero them by shifting left logically by 12 bytes and then
                    // shifting right logically by 12 bytes.
                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
                }
                else
                {
                    genSIMDZero(targetType, TYP_FLOAT, targetReg);
                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
                }
            }
            break;

        case SMT_ZeroInitUpper_SrcHasUpperZeros:
            if (srcReg != targetReg)
            {
                instruction ins = ins_Copy(baseType);
                assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
                inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
            }
            break;

        default:
            unreached();
    }
}

void CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg)
{
    // We just use `INS_xorps` instead of `getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType)`
    // since `genSIMDZero` is used for both `System.Numerics.Vectors` and HardwareIntrinsics. Modern
    // CPUs handle this specially in the renamer and it never hits the execution pipeline; additionally,
    // `INS_xorps` is always available (when using either the legacy or VEX encoding).
    inst_RV_RV(INS_xorps, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
}

//------------------------------------------------------------------------
// genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);

    GenTree*  op1       = simdNode->gtGetOp1();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    SIMDLevel level      = compiler->getSIMDSupportLevel();
    unsigned  size       = simdNode->gtSIMDSize;

    // Should never see small int base type vectors except for zero initialization.
    noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));

    instruction ins = INS_invalid;

#if !defined(_TARGET_64BIT_)
    if (op1->OperGet() == GT_LONG)
    {
        assert(varTypeIsLong(baseType));

        GenTree* op1lo = op1->gtGetOp1();
        GenTree* op1hi = op1->gtGetOp2();

        if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
        {
            genSIMDZero(targetType, baseType, targetReg);
        }
        else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
        {
            // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
        }
        else
        {
            // Generate:
            //     mov_i2xmm targetReg, op1lo
            //     mov_i2xmm xmmtmp, op1hi
            //     shl xmmtmp, 4 bytes
            //     por targetReg, xmmtmp
            // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
            //     shufpd targetReg, targetReg, 0 // move the long to all the lanes
            // For AVX2, move it to all 4 of the 64-bit lanes using:
            //     vpbroadcastq targetReg, targetReg

            instruction ins;

            regNumber op1loReg = genConsumeReg(op1lo);
            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
            inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));

            regNumber tmpReg = simdNode->GetSingleTempReg();

            regNumber op1hiReg = genConsumeReg(op1hi);
            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
            inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));

            ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
            getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes

            ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
            inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));

            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
            {
                inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
            }
            else
            {
                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
            }
        }
    }
    else
#endif // !defined(_TARGET_64BIT_)
        if (op1->isContained())
    {
        if (op1->IsIntegralConst(0) || op1->IsFPZero())
        {
            genSIMDZero(targetType, baseType, targetReg);
        }
        else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1))
        {
            // Case of initializing elements of vector with all 1's:
            // generate pcmpeqd reg, reg.
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
        }
        else
        {
            assert(level == SIMD_AVX2_Supported);
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
            if (op1->IsCnsFltOrDbl())
            {
                getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
            }
            else if (op1->OperIsLocalAddr())
            {
                unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
                getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum,
                                          offset);
            }
            else
            {
                unreached();
            }
        }
    }
    else if (level == SIMD_AVX2_Supported && ((size == 32) || (size == 16)))
    {
        regNumber srcReg = genConsumeReg(op1);
        if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
        {
            ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
            assert(ins != INS_invalid);
            inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
            srcReg = targetReg;
        }

        ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
        getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
    }
    else
    {
        // If we reach here, op1 is not contained and we are using SSE, or it is a SubRegisterSIMDType.
        // In either case we are going to use the SSE2 shuffle instruction.

        regNumber op1Reg         = genConsumeReg(op1);
        unsigned  shuffleControl = 0;

        if (compiler->isSubRegisterSIMDType(simdNode))
        {
            assert(baseType == TYP_FLOAT);

            // We cannot assume that the upper bits of op1Reg or targetReg are zero.
            // Therefore we need to explicitly zero out the upper bits. This is
            // essential for the shuffle operation performed below.
            //
            // If op1 is a float/double constant, we would have loaded it from
            // the data section using movss/sd. Similarly, if op1 is a memory op, we
            // would have loaded it using movss/sd, which zeroes out the upper
            // bits of the xmm reg when loading from memory. In these cases we can
            // avoid explicitly zeroing out targetReg if targetReg and op1Reg are
            // the same, or do it more efficiently if they are not.
            SIMDScalarMoveType moveType =
                op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper;

            genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType);

            if (size == 8)
            {
                shuffleControl = 0x50;
            }
            else if (size == 12)
            {
                shuffleControl = 0x40;
            }
            else
            {
                noway_assert(!"Unexpected size for SIMD type");
            }
        }
        else // Vector<T>
        {
            if (op1Reg != targetReg)
            {
                if (varTypeIsFloating(baseType))
                {
                    ins = ins_Copy(targetType);
                }
                else if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
                {
                    ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
                }

                assert(ins != INS_invalid);
                inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
            }
        }

        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
        getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
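
        // Note: with the scalar in element 0 and the upper elements zeroed, a shufps control
        // of 0x50 (binary 01 01 00 00) produces {v, v, 0, 0} for Vector2, 0x40 produces
        // {v, v, v, 0} for Vector3, and the default control of 0 broadcasts element 0 to
        // all four lanes for Vector4 and Vector<T>.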
    }

    genProduceReg(simdNode);
}

//-------------------------------------------------------------------------------------------
// genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
//                        a number of arguments equal to the length of the Vector.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);

    // Right now this intrinsic is supported only on TYP_FLOAT vectors
    var_types baseType = simdNode->gtSIMDBaseType;
    noway_assert(baseType == TYP_FLOAT);

    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);

    var_types targetType = simdNode->TypeGet();

    // Note that we cannot use targetReg before we have consumed all the source operands.
    // Therefore, we need an internal register to stitch together all the values into a
    // single vector in an XMM reg.
    regNumber vectorReg = simdNode->GetSingleTempReg();

    // Zero out vectorReg if we are constructing a vector whose size is not equal to the targetType vector size.
    // For example, in the case of Vector4f we don't need to zero when using SSE2.
    if (compiler->isSubRegisterSIMDType(simdNode))
    {
        genSIMDZero(targetType, baseType, vectorReg);
    }

    unsigned int baseTypeSize = genTypeSize(baseType);
    instruction  insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);

    // We will first consume the list items in execution (left to right) order,
    // and record the registers.
    regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT];
    unsigned  initCount = 0;
    for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
    {
        assert(list->OperGet() == GT_LIST);
        GenTree* listItem = list->gtGetOp1();
        assert(listItem->TypeGet() == baseType);
        assert(!listItem->isContained());
        regNumber operandReg   = genConsumeReg(listItem);
        operandRegs[initCount] = operandReg;
        initCount++;
    }

    unsigned int offset = 0;
    for (unsigned i = 0; i < initCount; i++)
    {
        // We will now construct the vector from the list items in reverse order.
        // This allows us to efficiently stitch together a vector as follows:
        //     vectorReg = (vectorReg << offset)
        //     vectorReg[0] = listItemReg
        // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
        // bits of vectorReg are not modified.

        regNumber operandReg = operandRegs[initCount - i - 1];
        if (offset != 0)
        {
            getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
        }
        genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper);

        offset += baseTypeSize;
    }
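    // For example (illustrative), for Vector3(x, y, z) the loop visits z, y, x: vectorReg
    // becomes {z, 0, 0, 0}, then {y, z, 0, 0} after a 4-byte shift and a scalar move, and
    // finally {x, y, z, 0}.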

    noway_assert(offset == simdNode->gtSIMDSize);

    // Load the initialized value.
    if (targetReg != vectorReg)
    {
        inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
    }
    genProduceReg(simdNode);
}

//----------------------------------------------------------------------------------
// genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAbs);

    GenTree*  op1       = simdNode->gtGetOp1();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();

    regNumber   op1Reg = genConsumeReg(op1);
    instruction ins    = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
    if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
    {
        inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }
    genProduceReg(simdNode);
}

//----------------------------------------------------------------------------------
// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
{
    SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
    assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32));

    GenTree*  op1       = simdNode->gtGetOp1();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();

    regNumber   op1Reg = genConsumeReg(op1);
    instruction ins    = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
    if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
    {
        regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
        regNumber tmpReg    = simdNode->ExtractTempReg(RBM_ALLFLOAT);
        regNumber tmpReg2   = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
        assert(tmpReg != op1Reg && tmpReg2 != op1Reg);

        // We will generate the following:
        //     vmovdqu      tmpReg2, op1Reg     (copy the src and put it into tmpReg2)
        //     vmovdqu      targetReg, op1Reg   (copy the src and put it into targetReg)
        //     vpsrld       targetReg, 16       (get the upper 16 bits of src and put them into targetReg)
        //     vpslld       tmpReg2, 16
        //     vpsrld       tmpReg2, 16         (get the lower 16 bits of src and put them into tmpReg2)
        //     mov          tmpIntReg, 0x5300000053000000
        //     vmovd        tmpReg, tmpIntReg
        //     vpbroadcastd tmpReg, tmpReg      (build mask for converting the upper 16 bits of src)
        //     vorps        targetReg, tmpReg
        //     vsubps       targetReg, tmpReg   (convert the upper 16 bits of src and put them into targetReg)
        //     vcvtdq2ps    tmpReg2, tmpReg2    (convert the lower 16 bits of src and put them into tmpReg2)
        //     vaddps       targetReg, tmpReg2  (add the upper 16 bits and the lower 16 bits)
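        //
        // (The 0x53000000 bias is 2^39 as a float: OR'ing it over the high halfwords gives
        // 2^39 + (hi << 16), so the vsubps leaves exactly hi << 16; the low halfwords fit
        // in 16 bits and convert exactly with vcvtdq2ps.)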
        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
        if (targetReg != op1Reg)
        {
            inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
        }

        // prepare upper 16 bits
        getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);

        // prepare lower 16 bits
        getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
        getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);

        // prepare mask
#ifdef _TARGET_AMD64_
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
#else
        if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
        {
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
            inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
        }
        else
        {
            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
            inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
            getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
            getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
        }
#endif
        if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
        {
            inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
        }
        else
        {
            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
        }

        // convert upper 16 bits
        inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
        inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));

        // convert lower 16 bits
        inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));

        // add lower 16 bits and upper 16 bits
        inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
    }
    else
    {
        inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }
    genProduceReg(simdNode);
}

//----------------------------------------------------------------------------------
// genSIMDLo64BitConvert: Generate code to convert the lower-most 64-bit item (long <--> double)
//
// Arguments:
//    intrinsicID      the SIMD intrinsic ID
//    simdType         the SIMD node type
//    baseType         the base type of the value to be converted
//    tmpReg           the tmp reg
//    tmpIntReg        the tmp integer reg
//    targetReg        the target reg
//
// Return Value:
//    None.
//
void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
                                    var_types       simdType,
                                    var_types       baseType,
                                    regNumber       tmpReg,
                                    regNumber       tmpIntReg,
                                    regNumber       targetReg)
{
    instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
    if (intrinsicID == SIMDIntrinsicConvertToDouble)
    {
        // Note that for mov_xmm2i, the int register is always in the reg2 position.
        inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
        inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
    }
    else
    {
        inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
        inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
    }
}

//----------------------------------------------------------------------------------
// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Notes:
//    There are no instructions for converting to/from 64-bit integers, so for these we
//    do the conversion an element at a time.
//
void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
{
    SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
    assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64));

    GenTree*  op1       = simdNode->gtGetOp1();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types simdType  = simdNode->TypeGet();
    regNumber op1Reg    = genConsumeReg(op1);
    regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
    regNumber tmpReg;
    regNumber tmpReg2;
    regNumber tmpReg3;
    SIMDLevel level = compiler->getSIMDSupportLevel();

#ifdef _TARGET_X86_
    if (baseType == TYP_LONG)
    {
        tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
        tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
        tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
        assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
    }
    else
#endif
        if (level == SIMD_AVX2_Supported || (baseType == TYP_ULONG))
    {
        tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
        tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
        tmpReg3 = REG_NA;
        assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
    }
    else
    {
        tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
        assert(tmpReg != op1Reg);
        tmpReg2 = REG_NA;
        tmpReg3 = REG_NA;
    }

    if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
    {
        // We will generate the following:
        //     vmovdqu      tmpReg2, op1Reg     (copy the src and put it into tmpReg2)
        //     vmovdqu      targetReg, op1Reg   (copy the src and put it into targetReg)
        //     vpsrlq       targetReg, 32       (get the upper 32 bits of src and put them into targetReg)
        //     vpsllq       tmpReg2, 32
        //     vpsrlq       tmpReg2, 32         (get the lower 32 bits of src and put them into tmpReg2)
        //     mov          tmpIntReg, 0x4530000000000000
        //     vmovd        tmpReg, tmpIntReg
        //     vpbroadcastq tmpReg, tmpReg      (build mask for the upper 32 bits of src)
        //     vorpd        targetReg, tmpReg
        //     vsubpd       targetReg, tmpReg   (convert the upper 32 bits of src and put them into targetReg)
        //     mov          tmpIntReg, 0x4330000000000000
        //     vmovd        tmpReg, tmpIntReg
        //     vpbroadcastq tmpReg, tmpReg      (build mask for the lower 32 bits of src)
        //     vorpd        tmpReg2, tmpReg
        //     vsubpd       tmpReg2, tmpReg     (convert the lower 32 bits of src and put them into tmpReg2)
        //     vaddpd       targetReg, tmpReg2  (add the upper 32 bits and the lower 32 bits together)
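        //
        // (These are the standard bias constants for this trick: 0x4330000000000000 is 2^52
        // and 0x4530000000000000 is 2^84 as doubles, so the OR/subtract pairs recover the low
        // and high 32-bit halves of each ulong exactly, and vaddpd recombines them.)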
        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
        if (targetReg != op1Reg)
        {
            inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
        }

        // prepare upper 32 bits
        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);

        // prepare lower 32 bits
        getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);

        // prepare mask for converting upper 32 bits
#ifdef _TARGET_AMD64_
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
#else
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
#endif
        if (level == SIMD_AVX2_Supported)
        {
            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
        }
        else
        {
            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
        }

        // convert upper 32 bits
        inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
        inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));

        // prepare mask for converting lower 32 bits
#ifdef _TARGET_AMD64_
        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
#else
        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
#endif
        if (level == SIMD_AVX2_Supported)
        {
            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
        }
        else
        {
            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
        }

        // convert lower 32 bits
        inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
        inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));

        // add lower 32 bits and upper 32 bits
        inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
    }
    else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
    {
#ifdef _TARGET_AMD64_
        instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
        instruction leftShiftIns  = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);

        if (level == SIMD_AVX2_Supported)
        {
            // Extract the high 16 bytes (the upper 128 bits).
            getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);

            // Put v[3] (the high-order element) in tmpReg2 and convert it.
            inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
            getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);

            // Shift the resulting 64-bits left.
            getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);

            // Convert v[2], in the lo bits of tmpReg.
            // For the convert to double, the convert preserves the upper bits in tmpReg2.
            // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
        }

        // Put v[1] in tmpReg.
        inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
        getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);

        // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);

        // Shift the resulting 64-bits left.
        getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);

        // Convert the lo 64-bits into targetReg.
        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);

        // Merge or copy the results (only at this point are we done with op1Reg).
        if (tmpReg != targetReg)
        {
            inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
        }

        if (level == SIMD_AVX2_Supported)
        {
            getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
        }
#else
        // get the sign bit of each long and put it in tmpReg3
        inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
        getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);

        // get the absolute value of src and put it into tmpReg2 and targetReg
        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
        getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
        inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
        inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
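        // ((x ^ m) - m, with m = x >> 63 broadcast across the qword, is the branch-free
        // two's complement absolute value.)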
| 1376 | inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType)); |
| 1377 | |
| 1378 | // prepare upper 32 bits |
| 1379 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); |
| 1380 | |
| 1381 | // prepare lower 32 bits |
| 1382 | getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); |
| 1383 | getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); |
| 1384 | |
| 1385 | // prepare mask for converting upper 32 bits |
| 1386 | getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000); |
| 1387 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
| 1388 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
| 1389 | |
| 1390 | if (level == SIMD_AVX2_Supported) |
| 1391 | { |
| 1392 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1393 | } |
| 1394 | else |
| 1395 | { |
| 1396 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1397 | } |
| 1398 | |
| 1399 | // convert upper 32 bits |
| 1400 | inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1401 | inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1402 | |
| 1403 | // prepare mask for converting lower 32 bits |
| 1404 | getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000); |
| 1405 | inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); |
| 1406 | getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); |
| 1407 | |
| 1408 | if (level == SIMD_AVX2_Supported) |
| 1409 | { |
| 1410 | inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1411 | } |
| 1412 | else |
| 1413 | { |
| 1414 | inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1415 | } |
| 1416 | |
| 1417 | // convert lower 32 bits |
| 1418 | inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1419 | inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1420 | |
| 1421 | // add lower 32 bits and upper 32 bits |
| 1422 | inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); |
| 1423 | |
| 1424 | // add sign bit |
| 1425 | inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType)); |
| 1426 | #endif |
| 1427 | } |
| 1428 | else |
| 1429 | { |
| 1430 | instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 1431 | instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
| 1432 | |
| 1433 | if (level == SIMD_AVX2_Supported) |
| 1434 | { |
| 1435 | // Extract the high 16-bits |
| 1436 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
| 1437 | |
| 1438 | // Put v[3] (the high-order element) in tmpReg2 and convert it. |
| 1439 | inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1440 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
| 1441 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); |
| 1442 | |
| 1443 | // Shift the resulting 64-bits left. |
| 1444 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); |
| 1445 | |
| 1446 | // Convert v[2], in the lo bits of tmpReg. |
| 1447 | // For the convert to double, the convert preserves the upper bits in tmpReg2. |
| 1448 | // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. |
| 1449 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
| 1450 | inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1451 | } |
| 1452 | |
| 1453 | // Put v[1] in tmpReg. |
| 1454 | inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); |
| 1455 | getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
| 1456 | |
| 1457 | // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. |
| 1458 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); |
| 1459 | |
| 1460 | // Shift the resulting 64-bits left. |
| 1461 | getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); |
| 1462 | |
| 1463 | // Convert the lo 64-bits into targetReg |
| 1464 | genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg); |
| 1465 | |
| 1466 | // Merge or copy the results (only at this point are we done with op1Reg). |
| 1467 | assert(tmpReg != targetReg); |
| 1468 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); |
| 1469 | if (level == SIMD_AVX2_Supported) |
| 1470 | { |
| 1471 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01); |
| 1472 | } |
| 1473 | } |
| 1474 | genProduceReg(simdNode); |
| 1475 | } |
| 1476 | |
| 1477 | //-------------------------------------------------------------------------------- |
| 1478 | // genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register |
| 1479 | // |
| 1480 | // Arguments: |
| 1481 | // simdNode - The GT_SIMD node |
| 1482 | // |
| 1483 | // Notes: |
| 1484 | // This is used for the WidenHi intrinsic to extract the upper half. |
| 1485 | // On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes. |
| 1486 | // |
| 1487 | void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
| 1488 | { |
| 1489 | var_types simdType = simdNode->TypeGet(); |
| 1490 | emitAttr emitSize = emitActualTypeSize(simdType); |
| 1491 | if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) |
| 1492 | { |
| 1493 | instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
| 1494 | getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01); |
| 1495 | } |
| 1496 | else |
| 1497 | { |
| 1498 | instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 1499 | if (tgtReg != srcReg) |
| 1500 | { |
| 1501 | inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize); |
| 1502 | } |
| 1503 | getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8); |
| 1504 | } |
| 1505 | } |
| 1506 | |
| 1507 | //-------------------------------------------------------------------------------- |
| 1508 | // genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations |
| 1509 | // |
| 1510 | // Arguments: |
| 1511 | // simdNode - The GT_SIMD node |
| 1512 | // |
| 1513 | // Notes: |
| 1514 | // The Widen intrinsics are broken into separate intrinsics for the two results. |
| 1515 | // |
| 1516 | void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) |
| 1517 | { |
| 1518 | assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || |
| 1519 | (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); |
| 1520 | |
| 1521 | GenTree* op1 = simdNode->gtGetOp1(); |
| 1522 | var_types baseType = simdNode->gtSIMDBaseType; |
| 1523 | regNumber targetReg = simdNode->gtRegNum; |
| 1524 | assert(targetReg != REG_NA); |
| 1525 | var_types simdType = simdNode->TypeGet(); |
| 1526 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
| 1527 | |
| 1528 | genConsumeOperands(simdNode); |
| 1529 | regNumber op1Reg = op1->gtRegNum; |
| 1530 | regNumber srcReg = op1Reg; |
| 1531 | emitAttr emitSize = emitActualTypeSize(simdType); |
| 1532 | instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
| 1533 | |
| 1534 | if (baseType == TYP_FLOAT) |
| 1535 | { |
| 1536 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
| 1537 | { |
| 1538 | genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); |
| 1539 | srcReg = targetReg; |
| 1540 | } |
| 1541 | inst_RV_RV(widenIns, targetReg, srcReg, simdType); |
| 1542 | } |
| 1543 | else |
| 1544 | { |
| 1545 | // We will generate the following on AVX: |
| 1546 | // vpermq targetReg, op1Reg, 0xd4|0xe8 |
| 1547 | // vpxor tmpReg, tmpReg |
| 1548 | // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) |
| 1549 | // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg |
| 1550 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1551 | assert(tmpReg != op1Reg); |
| 1552 | |
| 1553 | if (level == SIMD_AVX2_Supported) |
| 1554 | { |
| 1555 | // permute op1Reg and put it into targetReg |
| 1556 | unsigned ival = 0xd4; |
| 1557 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) |
| 1558 | { |
| 1559 | ival = 0xe8; |
| 1560 | } |
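| | // Each 2-bit field of the vpermq immediate selects a source qword for the corresponding
| | // destination qword: 0xd4 yields {q0, q1, q1, q3}, so the low-half unpacks used by
| | // WidenLo see elements 0..3; 0xe8 yields {q0, q2, q2, q3}, so the high-half unpacks
| | // used by WidenHi see elements 4..7.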
| 1561 | getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival); |
| 1562 | } |
| 1563 | else if (targetReg != op1Reg) |
| 1564 | { |
| 1565 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
| 1566 | } |
| 1567 | |
| 1568 | genSIMDZero(simdType, baseType, tmpReg); |
| 1569 | if (!varTypeIsUnsigned(baseType)) |
| 1570 | { |
| 1571 | instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType); |
| 1572 | inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); |
| 1573 | } |
| 1574 | inst_RV_RV(widenIns, targetReg, tmpReg, simdType); |
| 1575 | } |
| 1576 | genProduceReg(simdNode); |
| 1577 | } |
| 1578 | |
| 1579 | //-------------------------------------------------------------------------------- |
| 1580 | // genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations |
| 1581 | // |
| 1582 | // Arguments: |
| 1583 | // simdNode - The GT_SIMD node |
| 1584 | // |
| 1585 | // Notes: |
| 1586 | // This intrinsic takes two arguments. The first operand is narrowed to produce the |
| 1587 | // lower elements of the results, and the second operand produces the high elements. |
| 1588 | // |
| 1589 | void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) |
| 1590 | { |
| 1591 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); |
| 1592 | |
| 1593 | GenTree* op1 = simdNode->gtGetOp1(); |
| 1594 | GenTree* op2 = simdNode->gtGetOp2(); |
| 1595 | var_types baseType = simdNode->gtSIMDBaseType; |
| 1596 | regNumber targetReg = simdNode->gtRegNum; |
| 1597 | assert(targetReg != REG_NA); |
| 1598 | var_types simdType = simdNode->TypeGet(); |
| 1599 | emitAttr emitSize = emitTypeSize(simdType); |
| 1600 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
| 1601 | |
| 1602 | genConsumeOperands(simdNode); |
| 1603 | regNumber op1Reg = op1->gtRegNum; |
| 1604 | regNumber op2Reg = op2->gtRegNum; |
| 1605 | if (baseType == TYP_DOUBLE) |
| 1606 | { |
| 1607 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1608 | |
| 1609 | inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); |
| 1610 | inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); |
| 1611 | // Now insert the high-order result (in tmpReg) into the upper half of targetReg. |
| 1612 | if (level == SIMD_AVX2_Supported) |
| 1613 | { |
| 1614 | getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); |
| 1615 | } |
| 1616 | else |
| 1617 | { |
| 1618 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX); |
| 1619 | } |
| 1620 | } |
| 1621 | else if (varTypeIsLong(baseType)) |
| 1622 | { |
| 1623 | if (level == SIMD_AVX2_Supported) |
| 1624 | { |
| 1625 | // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. |
| 1626 | // We will generate the following: |
| 1627 | // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) |
| 1628 | // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) |
| 1629 | // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) |
| 1630 | // mov tmpReg2, op1Reg |
| 1631 | // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) |
| 1632 | // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg |
| 1633 | // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg |
| 1634 | // punpcklqdq tgtReg, tmpReg |
| 1635 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
| 1636 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1637 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); |
| 1638 | getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
| 1639 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); |
| 1640 | inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize); |
| 1641 | getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); |
| 1642 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX); |
| 1643 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX); |
| 1644 | inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); |
| 1645 | } |
| 1646 | else |
| 1647 | { |
| 1648 | // We will generate the following: |
| 1649 | // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) |
| 1650 | // psrldq targetReg, 8 (shift them right to get zeros in the high elements) |
| 1651 | // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) |
| 1652 | // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) |
| 1653 | // por targetReg, tmpReg |
| 1654 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1655 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
| 1656 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 1657 | emitAttr emitSize = emitTypeSize(simdType); |
| 1658 | |
| 1659 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX); |
| 1660 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); |
| 1661 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX); |
| 1662 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); |
| 1663 | inst_RV_RV(INS_por, targetReg, tmpReg, simdType); |
| 1664 | } |
| 1665 | } |
| 1666 | else |
| 1667 | { |
| 1668 | // We will generate the following: |
| 1669 | // mov targetReg, op1Reg |
| 1670 | // mov tmpReg, op2Reg |
| 1671 | // psll? targetReg, shiftCount
| 1672 | // psrl? targetReg, shiftCount
| 1673 | // psll? tmpReg, shiftCount
| 1674 | // psrl? tmpReg, shiftCount
| 1675 | // <pack> targetReg, tmpReg |
| 1676 | // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), |
| 1677 | // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to |
| 1678 | // get CLR type semantics; otherwise it will saturate). |
| 1679 | // |
| 1680 | int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); |
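| | // e.g., narrowing Vector<int>: genTypeSize(TYP_INT) == 4, so shiftCount == 16; shifting
| | // each 32-bit element left then right by 16 bits leaves it holding just its low 16 bits
| | // (sign- or zero-extended), so the saturating pack amounts to simple truncation.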
| 1681 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
| 1682 | instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); |
| 1683 | instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); |
| 1684 | |
| 1685 | if (level == SIMD_AVX2_Supported) |
| 1686 | { |
| 1687 | regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); |
| 1688 | regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1689 | |
| 1690 | // The AVX instructions generally operate on "lanes", so we have to permute the |
| 1691 | // inputs so that the destination register has the low 128-bit halves of the two |
| 1692 | // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs. |
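| | // In the vperm2i128 immediate, bits 1:0 select the source of the destination's low lane
| | // and bits 5:4 the high lane (0/1 = first source lo/hi, 2/3 = second source lo/hi):
| | // 0x20 produces {op1.lo, op2.lo} and 0x31 produces {op1.hi, op2.hi}.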
| 1693 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); |
| 1694 | getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); |
| 1695 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); |
| 1696 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); |
| 1697 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
| 1698 | getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); |
| 1699 | inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); |
| 1700 | } |
| 1701 | else |
| 1702 | { |
| 1703 | regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 1704 | |
| 1705 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); |
| 1706 | inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize); |
| 1707 | |
| 1708 | instruction tmpShiftRight = shiftRightIns; |
| 1709 | if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported) |
| 1710 | { |
| 1711 | tmpShiftRight = INS_psrad; |
| 1712 | } |
| 1713 | |
| 1714 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); |
| 1715 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); |
| 1716 | getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); |
| 1717 | getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); |
| 1718 | inst_RV_RV(ins, targetReg, tmpReg, simdType); |
| 1719 | } |
| 1720 | } |
| 1721 | genProduceReg(simdNode); |
| 1722 | } |
| 1723 | |
| 1724 | //-------------------------------------------------------------------------------- |
| 1725 | // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations |
| 1726 | // add, sub, mul, bit-wise And, AndNot and Or. |
| 1727 | // |
| 1728 | // Arguments: |
| 1729 | // simdNode - The GT_SIMD node |
| 1730 | // |
| 1731 | // Return Value: |
| 1732 | // None. |
| 1733 | // |
| 1734 | void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) |
| 1735 | { |
| 1736 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || |
| 1737 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv || |
| 1738 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || |
| 1739 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot || |
| 1740 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr || |
| 1741 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin || |
| 1742 | simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax); |
| 1743 | |
| 1744 | GenTree* op1 = simdNode->gtGetOp1(); |
| 1745 | GenTree* op2 = simdNode->gtGetOp2(); |
| 1746 | var_types baseType = simdNode->gtSIMDBaseType; |
| 1747 | regNumber targetReg = simdNode->gtRegNum; |
| 1748 | assert(targetReg != REG_NA); |
| 1749 | var_types targetType = simdNode->TypeGet(); |
| 1750 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
| 1751 | |
| 1752 | genConsumeOperands(simdNode); |
| 1753 | regNumber op1Reg = op1->gtRegNum; |
| 1754 | regNumber op2Reg = op2->gtRegNum; |
| 1755 | regNumber otherReg = op2Reg; |
| 1756 | |
| 1757 | // Vector<Int>.Mul: |
| 1758 | // SSE2 doesn't have an instruction to perform this operation directly |
| 1759 | // whereas SSE4.1 does (pmulld). This is special cased and computed |
| 1760 | // as follows. |
| 1761 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported) |
| 1762 | { |
| 1763 | // We need a temporary register that is NOT the same as the target, |
| 1764 | // and we MAY need another. |
| 1765 | regNumber tmpReg = simdNode->ExtractTempReg(); |
| 1766 | regNumber tmpReg2 = simdNode->GetSingleTempReg(); |
| 1767 | |
| 1768 | // The register allocator guarantees the following conditions: |
| 1769 | // - the only registers that may be the same among op1Reg, op2Reg, tmpReg |
| 1770 | // and tmpReg2 are op1Reg and op2Reg. |
| 1771 | // Let's be extra-careful and assert that now. |
| 1772 | assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) && |
| 1773 | (tmpReg != tmpReg2)); |
| 1774 | |
| 1775 | // We will start by setting things up so that: |
| 1776 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
| 1777 | // - We have op2 in op2Reg and tmpReg |
| 1778 | // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified, |
| 1779 | // OR they are the targetReg that will be produced. |
| 1780 | // (Note that in the code we generate below op1Reg and op2Reg are never written.) |
| 1781 | // We will copy things as necessary to ensure that this is the case. |
| 1782 | // Note that we can swap op1 and op2, since multiplication is commutative. |
| 1783 | // We will not modify the values in op1Reg and op2Reg. |
| 1784 | // (Though note that if either op1 or op2 is the same as targetReg, we will make |
| 1785 | // a copy and use that copy as the input register. In that case we WILL modify |
| 1786 | // the original value in the register, but will wind up with the result in targetReg |
| 1787 | // in the end, as expected.) |
| 1788 | |
| 1789 | // First, we need a tmpReg that is NOT the same as targetReg. |
| 1790 | // Note that if we have another reg that is the same as targetReg, |
| 1791 | // we can use tmpReg2 for that case, as we will not have hit this case. |
| 1792 | if (tmpReg == targetReg) |
| 1793 | { |
| 1794 | tmpReg = tmpReg2; |
| 1795 | } |
| 1796 | |
| 1797 | if (op2Reg == targetReg) |
| 1798 | { |
| 1799 | // We will swap the operands. |
| 1800 | // Since the code below only deals with registers, this now becomes the case where |
| 1801 | // op1Reg == targetReg. |
| 1802 | op2Reg = op1Reg; |
| 1803 | op1Reg = targetReg; |
| 1804 | } |
| 1805 | if (op1Reg == targetReg) |
| 1806 | { |
| 1807 | // Copy op1, and make tmpReg2 the new op1Reg. |
| 1808 | // Note that those regs can't be the same, as we asserted above. |
| 1809 | // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit |
| 1810 | // the "tmpReg == targetReg" case. |
| 1811 | inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 1812 | op1Reg = tmpReg2; |
| 1813 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
| 1814 | // However, we have one more case to worry about: what if op2Reg is also targetReg |
| 1815 | // (i.e. we have the same operand as op1 and op2)? |
| 1816 | // In that case we will set op2Reg to the same register as op1Reg. |
| 1817 | if (op2Reg == targetReg) |
| 1818 | { |
| 1819 | op2Reg = tmpReg2; |
| 1820 | } |
| 1821 | } |
| 1822 | else |
| 1823 | { |
| 1824 | // Copy op1 to targetReg and op2 to tmpReg. |
| 1825 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 1826 | inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
| 1827 | } |
| 1828 | // Let's assert that things are as we expect. |
| 1829 | // - We have op1 in op1Reg and targetReg, and they are different registers. |
| 1830 | assert(op1Reg != targetReg); |
| 1831 | // - We have op2 in op2Reg and tmpReg, and they are different registers. |
| 1832 | assert(op2Reg != tmpReg); |
| 1833 | // - Either we are going to leave op1's reg unmodified, or it is the targetReg. |
| 1834 | assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg)); |
| 1835 | // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg. |
| 1836 | assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg)); |
| 1837 | |
| 1838 | // Now we can generate the code. |
| 1839 | |
| 1840 | // targetReg = op1 >> 4-bytes (op1 is already in targetReg) |
| 1841 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4); |
| 1842 | |
| 1843 | // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg) |
| 1844 | getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4); |
| 1845 | |
| 1846 | // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially |
| 1847 | // tmpReg[63:0] = op1[1] * op2[1] |
| 1848 | // tmpReg[127:64] = op1[3] * op2[3] |
| 1849 | inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType)); |
| 1850 | |
| 1851 | // Extract first and third double word results from tmpReg |
| 1852 | // tmpReg = shuffle(0,0,2,0) of tmpReg |
| 1853 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX); |
| 1854 | |
| 1855 | // targetReg[63:0] = op1[0] * op2[0] |
| 1856 | // targetReg[127:64] = op1[2] * op2[2] |
| 1857 | inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 1858 | inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType)); |
| 1859 | |
| 1860 | // Extract first and third double word results from targetReg |
| 1861 | // targetReg = shuffle(0,0,2,0) of targetReg |
| 1862 | getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX); |
| 1863 | |
| 1864 | // pack the results into a single vector |
| 1865 | inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); |
| 1866 | } |
| 1867 | else |
| 1868 | { |
| 1869 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); |
| 1870 | |
| 1871 | // Currently AVX doesn't support integer conversions;
| 1872 | // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
| 1873 | if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && |
| 1874 | !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins)) |
| 1875 | { |
| 1876 | inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); |
| 1877 | } |
| 1878 | else |
| 1879 | { |
| 1880 | if (op2Reg == targetReg) |
| 1881 | { |
| 1882 | otherReg = op1Reg; |
| 1883 | } |
| 1884 | else if (op1Reg != targetReg) |
| 1885 | { |
| 1886 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 1887 | } |
| 1888 | |
| 1889 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
| 1890 | } |
| 1891 | } |
| 1892 | |
| 1893 | // Vector2/3 div: since the top-most elements will be zero, we end up |
| 1894 | // performing 0/0, which is a NaN. Therefore, post-division we need to set the
| 1895 | // top-most elements to zero. This is achieved by left logical shift followed |
| 1896 | // by right logical shift of targetReg. |
| 1897 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16)) |
| 1898 | { |
| 1899 | // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length. |
| 1900 | unsigned shiftCount = 16 - simdNode->gtSIMDSize; |
| 1901 | assert(shiftCount != 0); |
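| | // e.g., for Vector3 gtSIMDSize is 12, so shiftCount == 4: shifting left then right by
| | // 4 bytes clears the unused top float.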
| 1902 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); |
| 1903 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
| 1904 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 1905 | getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); |
| 1906 | } |
| 1907 | |
| 1908 | genProduceReg(simdNode); |
| 1909 | } |
| 1910 | |
| 1911 | //-------------------------------------------------------------------------------- |
| 1912 | // genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
| 1913 | // <, <=, >, >= and == |
| 1914 | // |
| 1915 | // Arguments: |
| 1916 | // simdNode - The GT_SIMD node |
| 1917 | // |
| 1918 | // Return Value: |
| 1919 | // None. |
| 1920 | // |
| 1921 | void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) |
| 1922 | { |
| 1923 | GenTree* op1 = simdNode->gtGetOp1(); |
| 1924 | GenTree* op2 = simdNode->gtGetOp2(); |
| 1925 | var_types baseType = simdNode->gtSIMDBaseType; |
| 1926 | regNumber targetReg = simdNode->gtRegNum; |
| 1927 | var_types targetType = simdNode->TypeGet(); |
| 1928 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
| 1929 | |
| 1930 | genConsumeOperands(simdNode); |
| 1931 | regNumber op1Reg = op1->gtRegNum; |
| 1932 | regNumber op2Reg = op2->gtRegNum; |
| 1933 | regNumber otherReg = op2Reg; |
| 1934 | |
| 1935 | switch (simdNode->gtSIMDIntrinsicID) |
| 1936 | { |
| 1937 | case SIMDIntrinsicEqual: |
| 1938 | case SIMDIntrinsicGreaterThan: |
| 1939 | { |
| 1940 | assert(targetReg != REG_NA); |
| 1941 | |
| 1942 | #ifdef DEBUG |
| 1943 | // SSE2: vector<(u)long> relational op should be implemented in terms of |
| 1944 | // TYP_INT comparison operations |
| 1945 | if (baseType == TYP_LONG || baseType == TYP_ULONG) |
| 1946 | { |
| 1947 | assert(level >= SIMD_SSE4_Supported); |
| 1948 | } |
| 1949 | #endif |
| 1950 | |
| 1951 | // Greater-than: Floating point vectors use "<" with swapped operands |
| 1952 | if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan) |
| 1953 | { |
| 1954 | assert(!varTypeIsFloating(baseType)); |
| 1955 | } |
| 1956 | |
| 1957 | unsigned ival = 0; |
| 1958 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
| 1959 | |
| 1960 | // targetReg = op1reg > op2reg |
| 1961 | // Therefore, we can optimize if op1Reg == targetReg |
| 1962 | otherReg = op2Reg; |
| 1963 | if (op1Reg != targetReg) |
| 1964 | { |
| 1965 | if (op2Reg == targetReg) |
| 1966 | { |
| 1967 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual); |
| 1968 | otherReg = op1Reg; |
| 1969 | } |
| 1970 | else |
| 1971 | { |
| 1972 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 1973 | } |
| 1974 | } |
| 1975 | |
| 1976 | if (varTypeIsFloating(baseType)) |
| 1977 | { |
| 1978 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival); |
| 1979 | } |
| 1980 | else |
| 1981 | { |
| 1982 | inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); |
| 1983 | } |
| 1984 | } |
| 1985 | break; |
| 1986 | |
| 1987 | case SIMDIntrinsicLessThan: |
| 1988 | case SIMDIntrinsicLessThanOrEqual: |
| 1989 | { |
| 1990 | assert(targetReg != REG_NA); |
| 1991 | |
| 1992 | // Int vectors use ">" and ">=" with swapped operands |
| 1993 | assert(varTypeIsFloating(baseType)); |
| 1994 | |
| 1995 | // Get the instruction opcode for compare operation |
| 1996 | unsigned ival; |
| 1997 | instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival); |
| 1998 | |
| 1999 | // targetReg = op1reg RelOp op2reg |
| 2000 | // Therefore, we can optimize if op1Reg == targetReg
| 2001 | if (op1Reg != targetReg) |
| 2002 | { |
| 2003 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 2004 | } |
| 2005 | |
| 2006 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival); |
| 2007 | } |
| 2008 | break; |
| 2009 | |
| 2010 | // (In)Equality that produces bool result instead of a bit vector |
| 2011 | case SIMDIntrinsicOpEquality: |
| 2012 | case SIMDIntrinsicOpInEquality: |
| 2013 | { |
| 2014 | // We're only setting condition flags; if a 0/1 value is desired, Lowering should have inserted a SETCC.
| 2015 | assert(targetReg == REG_NA); |
| 2016 | |
| 2017 | var_types simdType = op1->TypeGet(); |
| 2018 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
| 2019 | if (simdType == TYP_DOUBLE) |
| 2020 | { |
| 2021 | simdType = TYP_SIMD8; |
| 2022 | } |
| 2023 | |
| 2024 | // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16 |
| 2025 | // since both the operands will be in XMM registers. |
| 2026 | if (simdType == TYP_SIMD12) |
| 2027 | { |
| 2028 | simdType = TYP_SIMD16; |
| 2029 | } |
| 2030 | |
| 2031 | // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest. |
| 2032 | if (op2->isContained()) |
| 2033 | { |
| 2034 | assert((compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0)); |
| 2035 | inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType)); |
| 2036 | } |
| 2037 | else |
| 2038 | { |
| 2039 | // We need one additional SIMD register to store the result of the SIMD compare. |
| 2040 | regNumber tmpReg1 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); |
| 2041 | |
| 2042 | // tmpReg1 = (op1Reg == op2Reg) |
| 2043 | // Call this value of tmpReg1 as 'compResult' for further reference below. |
| 2044 | regNumber otherReg = op2Reg; |
| 2045 | if (tmpReg1 != op2Reg) |
| 2046 | { |
| 2047 | if (tmpReg1 != op1Reg) |
| 2048 | { |
| 2049 | inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType)); |
| 2050 | } |
| 2051 | } |
| 2052 | else |
| 2053 | { |
| 2054 | otherReg = op1Reg; |
| 2055 | } |
| 2056 | |
| 2057 | // For all integer types we can use TYP_INT comparison. |
| 2058 | unsigned ival = 0; |
| 2059 | instruction ins = |
| 2060 | getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival); |
| 2061 | |
| 2062 | if (varTypeIsFloating(baseType)) |
| 2063 | { |
| 2064 | getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival); |
| 2065 | } |
| 2066 | else |
| 2067 | { |
| 2068 | inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType)); |
| 2069 | } |
| 2070 | |
| 2071 | regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT); |
| 2072 | inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType)); |
| 2073 | // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare |
| 2074 | // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a |
| 2075 | // subset of each component's ones/zeroes. In the end we need to know if the result is |
| 2076 | // "all ones" where the number of ones is given by the vector byte size, not by the |
| 2077 | // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and |
| 2078 | // for SSE registers we need to compare to 0x0000FFFF. |
| 2079 | // The SIMD12 case is handled specially, because we can't rely on the upper bytes being |
| 2080 | // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF). |
| 2081 | // Note that -1 is used instead of 0xFFFFFFFF; on x64 the emitter doesn't correctly recognize
| 2082 | // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF |
| 2083 | // encoding instead of 83F8FF. |
| 2084 | ssize_t mask; |
| 2085 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
| 2086 | { |
| 2087 | mask = 0x00000FFF; |
| 2088 | getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask); |
| 2089 | } |
| 2090 | else if (emitActualTypeSize(simdType) == 32) |
| 2091 | { |
| 2092 | mask = -1; |
| 2093 | } |
| 2094 | else |
| 2095 | { |
| 2096 | mask = 0x0000FFFF; |
| 2097 | } |
| 2098 | getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask); |
| 2099 | } |
| 2100 | } |
| 2101 | break; |
| 2102 | |
| 2103 | default: |
| 2104 | noway_assert(!"Unimplemented SIMD relational operation." ); |
| 2105 | unreached(); |
| 2106 | } |
| 2107 | |
| 2108 | genProduceReg(simdNode); |
| 2109 | } |
| 2110 | |
| 2111 | //-------------------------------------------------------------------------------- |
| 2112 | // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product. |
| 2113 | // |
| 2114 | // Arguments: |
| 2115 | // simdNode - The GT_SIMD node |
| 2116 | // |
| 2117 | // Return Value: |
| 2118 | // None. |
| 2119 | // |
| 2120 | void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode) |
| 2121 | { |
| 2122 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct); |
| 2123 | |
| 2124 | GenTree* op1 = simdNode->gtGetOp1(); |
| 2125 | GenTree* op2 = simdNode->gtGetOp2(); |
| 2126 | var_types baseType = simdNode->gtSIMDBaseType; |
| 2127 | var_types simdType = op1->TypeGet(); |
| 2128 | // TODO-1stClassStructs: Temporary to minimize asmDiffs |
| 2129 | if (simdType == TYP_DOUBLE) |
| 2130 | { |
| 2131 | simdType = TYP_SIMD8; |
| 2132 | } |
| 2133 | var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType; |
| 2134 | regNumber targetReg = simdNode->gtRegNum; |
| 2135 | assert(targetReg != REG_NA); |
| 2136 | |
| 2137 | var_types targetType = simdNode->TypeGet(); |
| 2138 | assert(targetType == baseType); |
| 2139 | |
| 2140 | genConsumeOperands(simdNode); |
| 2141 | regNumber op1Reg = op1->gtRegNum; |
| 2142 | regNumber op2Reg = op2->gtRegNum; |
| 2143 | regNumber tmpReg1 = REG_NA; |
| 2144 | regNumber tmpReg2 = REG_NA; |
| 2145 | |
| 2146 | SIMDLevel level = compiler->getSIMDSupportLevel(); |
| 2147 | |
| 2148 | // Dot product intrinsic is supported only on float/double vectors |
| 2149 | // and 32-byte int vectors on AVX. |
| 2150 | // |
| 2151 | // Float/Double Vectors: |
| 2152 | // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register |
| 2153 | // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or |
| 2154 | // smaller on AVX, then we don't need a tmpReg. |
| 2155 | // |
| 2156 | // 32-byte integer vector on AVX: we need two additional Xmm registers |
| 2157 | // different from targetReg as scratch. |
| 2158 | // |
| 2159 | // 16-byte integer vector on SSE4: we need one additional Xmm register |
| 2160 | // different from targetReg as scratch. |
| 2161 | if (varTypeIsFloating(baseType)) |
| 2162 | { |
| 2163 | if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32)) |
| 2164 | { |
| 2165 | tmpReg1 = simdNode->GetSingleTempReg(); |
| 2166 | assert(tmpReg1 != targetReg); |
| 2167 | } |
| 2168 | else |
| 2169 | { |
| 2170 | assert(simdNode->AvailableTempRegCount() == 0); |
| 2171 | } |
| 2172 | } |
| 2173 | else |
| 2174 | { |
| 2175 | assert(baseType == TYP_INT); |
| 2176 | assert(level >= SIMD_SSE4_Supported); |
| 2177 | |
| 2178 | if (level == SIMD_SSE4_Supported) |
| 2179 | { |
| 2180 | tmpReg1 = simdNode->GetSingleTempReg(); |
| 2181 | } |
| 2182 | else |
| 2183 | { |
| 2184 | tmpReg1 = simdNode->ExtractTempReg(); |
| 2185 | tmpReg2 = simdNode->GetSingleTempReg(); |
| 2186 | } |
| 2187 | } |
| 2188 | |
| 2189 | if (level == SIMD_SSE2_Supported) |
| 2190 | { |
| 2191 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg |
| 2192 | if (op1Reg == targetReg) |
| 2193 | { |
| 2194 | // Best case |
| 2195 | // nothing to do, we have registers in the right place |
| 2196 | } |
| 2197 | else if (op2Reg == targetReg) |
| 2198 | { |
| 2199 | op2Reg = op1Reg; |
| 2200 | } |
| 2201 | else |
| 2202 | { |
| 2203 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
| 2204 | } |
| 2205 | |
| 2206 | // DotProduct(v1, v2) |
| 2207 | // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1 |
| 2208 | if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) |
| 2209 | { |
| 2210 | assert(baseType == TYP_FLOAT); |
| 2211 | // v0 = v1 * v2 |
| 2212 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
| 2213 | // // position |
| 2214 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper |
| 2215 | // // bits |
| 2216 | // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1) |
| 2217 | // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2) |
| 2218 | // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2) |
| 2219 | // |
| 2220 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
| 2221 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
| 2222 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY); |
| 2223 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
| 2224 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW); |
| 2225 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
| 2226 | } |
| 2227 | else if (baseType == TYP_FLOAT) |
| 2228 | { |
| 2229 | // v0 = v1 * v2 |
| 2230 | // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its |
| 2231 | // // position |
| 2232 | // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1) |
| 2233 | // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1) |
| 2234 | // tmp = v0 |
| 2235 | // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2) |
| 2236 | // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3) |
| 2237 | // // Essentially horizontal addition of all elements. |
| 2238 | // // We could achieve the same using the SSE3 instruction
| 2239 | // // HADDPS. |
| 2240 | // |
| 2241 | inst_RV_RV(INS_mulps, targetReg, op2Reg); |
| 2242 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
| 2243 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY); |
| 2244 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
| 2245 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
| 2246 | inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW); |
| 2247 | inst_RV_RV(INS_addps, targetReg, tmpReg1); |
| 2248 | } |
| 2249 | else |
| 2250 | { |
| 2251 | assert(baseType == TYP_DOUBLE); |
| 2252 | |
| 2253 | // v0 = v1 * v2 |
| 2254 | // tmp = v0 // v0 = (1, 0) - each element is given by its position |
| 2255 | // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1) |
| 2256 | // v0 = v0 + tmp // v0 = (1+0, 0+1) |
| 2257 | inst_RV_RV(INS_mulpd, targetReg, op2Reg); |
| 2258 | inst_RV_RV(INS_movaps, tmpReg1, targetReg); |
| 2259 | inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01); |
| 2260 | inst_RV_RV(INS_addpd, targetReg, tmpReg1); |
| 2261 | } |
| 2262 | } |
| 2263 | else |
| 2264 | { |
| 2265 | assert(level >= SIMD_SSE4_Supported); |
| 2266 | |
| 2267 | if (varTypeIsFloating(baseType)) |
| 2268 | { |
| 2269 | // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg. |
| 2270 | // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually |
| 2271 | // use the 3-op form, so that we can avoid these copies. |
| 2272 | // TODO-CQ: Add inst_RV_RV_RV_IV(). |
| 2273 | if (op1Reg == targetReg) |
| 2274 | { |
| 2275 | // Best case |
| 2276 | // nothing to do, we have registers in the right place |
| 2277 | } |
| 2278 | else if (op2Reg == targetReg) |
| 2279 | { |
| 2280 | op2Reg = op1Reg; |
| 2281 | } |
| 2282 | else |
| 2283 | { |
| 2284 | inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); |
| 2285 | } |
| 2286 | |
| 2287 | emitAttr emitSize = emitActualTypeSize(simdEvalType); |
| 2288 | if (baseType == TYP_FLOAT) |
| 2289 | { |
| 2290 | // dpps computes the dot product of the upper & lower halves of the 32-byte register. |
| 2291 | // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. |
| 2292 | unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1; |
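| | // In the dpps immediate, the high nibble selects which source elements enter the dot
| | // product and the low nibble selects which destination elements receive the sum:
| | // 0xf1 sums all four products into element 0; 0x71 sums only elements 0-2 (Vector3).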
| 2293 | inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask); |
| 2296 | // If this is TYP_SIMD32, we need to combine the lower & upper results. |
| 2297 | if (simdEvalType == TYP_SIMD32) |
| 2298 | { |
| 2299 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
| 2300 | inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
| 2301 | } |
| 2302 | } |
| 2303 | else if (baseType == TYP_DOUBLE) |
| 2304 | { |
| 2305 | if (simdEvalType == TYP_SIMD32) |
| 2306 | { |
| 2307 | // targetReg = targetReg * op2Reg |
| 2308 | // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves |
| 2309 | // tmpReg1 = vextractf128(targetReg, 1) ; Moves the upper sum into tmpReg1
| 2310 | // targetReg = targetReg + tmpReg1 |
| 2311 | inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType)); |
| 2312 | inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType)); |
| 2313 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); |
| 2314 | inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); |
| 2315 | } |
| 2316 | else |
| 2317 | { |
| 2318 | // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use |
| 2319 | // dppd directly. |
| 2320 | assert(level == SIMD_SSE4_Supported); |
| 2321 | inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31); |
| 2322 | } |
| 2323 | } |
| 2324 | } |
| 2325 | else |
| 2326 | { |
| 2327 | // Dot product of a 16-byte or 32-byte int vector on SSE4/AVX.
| 2328 | assert(baseType == TYP_INT); |
| 2329 | assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32); |
| 2330 | |
| 2331 | #ifdef DEBUG |
| 2332 | // SSE4: We need 1 scratch register. |
| 2333 | // AVX2: We need 2 scratch registers. |
| 2334 | if (simdEvalType == TYP_SIMD16) |
| 2335 | { |
| 2336 | assert(tmpReg1 != REG_NA); |
| 2337 | } |
| 2338 | else |
| 2339 | { |
| 2340 | assert(tmpReg1 != REG_NA); |
| 2341 | assert(tmpReg2 != REG_NA); |
| 2342 | } |
| 2343 | #endif |
| 2344 | |
| 2345 | // tmpReg1 = op1 * op2 |
| 2346 | if (level == SIMD_AVX2_Supported) |
| 2347 | { |
| 2348 | // On AVX, take advantage of the 3-operand form of pmulld
| 2349 | inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType)); |
| 2350 | } |
| 2351 | else |
| 2352 | { |
| 2353 | inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType); |
| 2354 | inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType); |
| 2355 | } |
| 2356 | |
| 2357 | if (simdEvalType == TYP_SIMD32) |
| 2358 | { |
| 2359 | // tmpReg2[127..0] = Upper 128-bits of tmpReg1 |
| 2360 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01); |
| 2361 | |
| 2362 | // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0] |
| 2363 | // This will compute |
| 2364 | // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4] |
| 2365 | // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5] |
| 2366 | // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6] |
| 2367 | // tmpReg1[3] = op1[3]*op2[3] + op1[7]*op2[7]
| 2368 | inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE); |
| 2369 | } |
| 2370 | |
| 2371 | // This horizontal add will compute |
| 2372 | // |
| 2373 | // TYP_SIMD16: |
| 2374 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1] |
| 2375 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[3]*op2[3]
| 2376 | // |
| 2377 | // TYP_SIMD32: |
| 2378 | // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5] |
| 2379 | // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[3]*op2[3] + op1[7]*op2[7]
| 2380 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
| 2381 | |
| 2382 | // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1] |
| 2383 | inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); |
| 2384 | |
| 2385 | // TargetReg = integer result from tmpReg1 |
| 2386 | // (Note that for mov_xmm2i, the int register is always in the reg2 position) |
| 2387 | inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT); |
| 2388 | } |
| 2389 | } |
| 2390 | |
| 2391 | genProduceReg(simdNode); |
| 2392 | } |
| 2393 | |
| 2394 | //------------------------------------------------------------------------------------ |
| 2395 | // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. |
| 2396 | // |
| 2397 | // Arguments: |
| 2398 | // simdNode - The GT_SIMD node |
| 2399 | // |
| 2400 | // Return Value: |
| 2401 | // None. |
| 2402 | // |
| 2403 | void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) |
| 2404 | { |
| 2405 | assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem); |
| 2406 | |
| 2407 | GenTree* op1 = simdNode->gtGetOp1(); |
| 2408 | GenTree* op2 = simdNode->gtGetOp2(); |
| 2409 | var_types simdType = op1->TypeGet(); |
| 2410 | assert(varTypeIsSIMD(simdType)); |
| 2411 | |
| 2412 | // op1 of TYP_SIMD12 should be considered as TYP_SIMD16, |
| 2413 | // since it is in XMM register. |
| 2414 | if (simdType == TYP_SIMD12) |
| 2415 | { |
| 2416 | simdType = TYP_SIMD16; |
| 2417 | } |
| 2418 | |
| 2419 | var_types baseType = simdNode->gtSIMDBaseType; |
| 2420 | regNumber targetReg = simdNode->gtRegNum; |
| 2421 | assert(targetReg != REG_NA); |
| 2422 | var_types targetType = simdNode->TypeGet(); |
| 2423 | assert(targetType == genActualType(baseType)); |
| 2424 | |
| 2425 | // GetItem has 2 operands: |
| 2426 | // - the source of SIMD type (op1) |
| 2427 | // - the index of the value to be returned. |
| 2428 | genConsumeOperands(simdNode); |
| 2429 | regNumber srcReg = op1->gtRegNum; |
| 2430 | |
| 2431 | // Optimize the case where op1 is in memory and we are trying to access the i'th element.
| 2432 | if (!op1->isUsedFromReg()) |
| 2433 | { |
| 2434 | assert(op1->isContained()); |
| 2435 | |
| 2436 | regNumber baseReg; |
| 2437 | regNumber indexReg; |
| 2438 | int offset = 0; |
| 2439 | |
| 2440 | if (op1->OperIsLocal()) |
| 2441 | { |
| 2442 | // There are three parts to the total offset here: |
| 2443 | // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}. |
| 2444 | bool isEBPbased; |
| 2445 | unsigned varNum = op1->gtLclVarCommon.gtLclNum; |
| 2446 | offset += compiler->lvaFrameAddress(varNum, &isEBPbased); |
| 2447 | if (op1->OperGet() == GT_LCL_FLD) |
| 2448 | { |
| 2449 | offset += op1->gtLclFld.gtLclOffs; |
| 2450 | } |
| 2451 | baseReg = (isEBPbased) ? REG_EBP : REG_ESP; |
| 2452 | } |
| 2453 | else |
| 2454 | { |
| 2455 | // Require GT_IND addr to be not contained. |
| 2456 | assert(op1->OperGet() == GT_IND); |
| 2457 | |
| 2458 | GenTree* addr = op1->AsIndir()->Addr(); |
| 2459 | assert(!addr->isContained()); |
| 2460 | baseReg = addr->gtRegNum; |
| 2461 | } |
| 2462 | |
| 2463 | if (op2->isContainedIntOrIImmed()) |
| 2464 | { |
| 2465 | indexReg = REG_NA; |
| 2466 | offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType); |
| 2467 | } |
| 2468 | else |
| 2469 | { |
| 2470 | indexReg = op2->gtRegNum; |
| 2471 | assert(genIsValidIntReg(indexReg)); |
| 2472 | } |
| 2473 | |
| 2474 | // Now, load the desired element. |
| 2475 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
| 2476 | emitTypeSize(baseType), // Of the vector baseType |
| 2477 | targetReg, // To targetReg |
| 2478 | baseReg, // Base Reg |
| 2479 | indexReg, // Indexed |
| 2480 | genTypeSize(baseType), // by the size of the baseType |
| 2481 | offset); |
| 2482 | genProduceReg(simdNode); |
| 2483 | return; |
| 2484 | } |
| 2485 | |
| 2486 | // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant. |
| 2487 | // For the non-constant case, we will use the SIMD temp location to store the vector, and |
| 2488 | // then load the desired element.
| 2489 | // The range check will already have been performed, so at this point we know we have an index |
| 2490 | // within the bounds of the vector. |
| 2491 | if (!op2->IsCnsIntOrI()) |
| 2492 | { |
| 2493 | unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum; |
| 2494 | noway_assert(simdInitTempVarNum != BAD_VAR_NUM); |
| 2495 | bool isEBPbased; |
| 2496 | unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased); |
| 2497 | regNumber indexReg = op2->gtRegNum; |
| 2498 | |
| 2499 | // Store the vector to the temp location. |
| 2500 | getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)), |
| 2501 | emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0); |
| 2502 | |
| 2503 | // Now, load the desired element. |
| 2504 | getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load |
| 2505 | emitTypeSize(baseType), // Of the vector baseType |
| 2506 | targetReg, // To targetReg |
| 2507 | (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based |
| 2508 | indexReg, // Indexed |
| 2509 | genTypeSize(baseType), // by the size of the baseType |
| 2510 | offs); |
| 2511 | genProduceReg(simdNode); |
| 2512 | return; |
| 2513 | } |
| 2514 | |
| 2515 | noway_assert(op2->isContained()); |
| 2516 | noway_assert(op2->IsCnsIntOrI()); |
| 2517 | unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal; |
| 2518 | unsigned int byteShiftCnt = index * genTypeSize(baseType); |
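| | // e.g., extracting element 2 of a Vector<float> gives byteShiftCnt == 8; the byte shift
| | // generated below then brings that element down into the low position.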
| 2519 | |
| 2520 | // In general we shouldn't have an index greater than or equal to the length of the vector. |
| 2521 | // However, if we have an out-of-range access, under minOpts it will not be optimized |
| 2522 | // away. The code will throw before we reach this point, but we still need to generate |
| 2523 | // code. In that case, we will simply mask off the upper bits. |
| 2524 | if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength()) |
| 2525 | { |
| 2526 | byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1); |
| 2527 | index = byteShiftCnt / genTypeSize(baseType); |
| 2528 | } |
| 2529 | |
| 2530 | regNumber tmpReg = REG_NA; |
| 2531 | if (simdNode->AvailableTempRegCount() != 0) |
| 2532 | { |
| 2533 | tmpReg = simdNode->GetSingleTempReg(); |
| 2534 | } |
| 2535 | else |
| 2536 | { |
| 2537 | assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) || |
| 2538 | (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16))); |
| 2539 | } |
| 2540 | |
| 2541 | if (byteShiftCnt >= 16) |
| 2542 | { |
| 2543 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
| 2544 | byteShiftCnt -= 16; |
| 2545 | regNumber newSrcReg; |
| 2546 | if (varTypeIsFloating(baseType)) |
| 2547 | { |
| 2548 | newSrcReg = targetReg; |
| 2549 | } |
| 2550 | else |
| 2551 | { |
| 2552 | // Integer types |
| 2553 | assert(tmpReg != REG_NA); |
| 2554 | newSrcReg = tmpReg; |
| 2555 | } |
| 2556 | getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01); |
| 2557 | |
| 2558 | srcReg = newSrcReg; |
| 2559 | } |
| 2560 | |
| 2561 | // Generate the following sequence: |
| 2562 | // 1) baseType is floating point |
| 2563 | // movaps targetReg, srcReg |
| 2564 | // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element |
| 2565 | // |
| 2566 | // 2) baseType is not floating point |
| 2567 | // movaps tmpReg, srcReg <-- not generated if accessing zero'th element |
| 2568 | // OR if tmpReg == srcReg |
| 2569 | // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element |
| 2570 | // mov_xmm2i targetReg, tmpReg |
| 2571 | if (varTypeIsFloating(baseType)) |
| 2572 | { |
| 2573 | if (targetReg != srcReg) |
| 2574 | { |
| 2575 | inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType)); |
| 2576 | } |
| 2577 | |
| 2578 | if (byteShiftCnt != 0) |
| 2579 | { |
| 2580 | instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 2581 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt); |
| 2582 | } |
| 2583 | } |
| 2584 | else |
| 2585 | { |
| 2586 | if (varTypeIsSmallInt(baseType)) |
| 2587 | { |
| 2588 | // Note that pextrw extracts a 16-bit value by index and zero-extends it to 32 bits.
| 2589 | // For Vector<short> we also need to sign-extend the 16-bit value in targetReg.
| 2590 | // For Vector<byte>, index/2 gives the index of the 16-bit value to extract; shift right
| 2591 | // by 8 bits if the index is odd. For Vector<sbyte>, also sign-extend targetReg.
| 2592 | |
| 2593 | unsigned baseSize = genTypeSize(baseType); |
| 2594 | if (baseSize == 1) |
| 2595 | { |
| 2596 | index /= 2; |
| 2597 | } |
| 2598 | // We actually want index % 8 for the AVX case (for SSE it will never be > 8). |
| 2599 | // Note that this doesn't matter functionally, because the instruction uses just the |
| 2600 | // low 3 bits of index, but it's better to use the right value. |
| 2601 | if (index > 8) |
| 2602 | { |
| 2603 | assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); |
| 2604 | index -= 8; |
| 2605 | } |
| 2606 | |
| 2607 | getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index); |
| 2608 | |
| 2609 | bool ZeroOrSignExtnReqd = true; |
| 2610 | if (baseSize == 1) |
| 2611 | { |
| 2612 | if ((op2->gtIntCon.gtIconVal % 2) == 1) |
| 2613 | { |
| 2614 | // Right-shift the extracted word by 8 bits when the index is odd, since we are extracting a byte-sized element.
| 2615 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8); |
| 2616 | |
| 2617 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE |
| 2618 | ZeroOrSignExtnReqd = (baseType == TYP_BYTE); |
| 2619 | } |
| 2620 | // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits |
| 2621 | } |
| 2622 | else |
| 2623 | { |
| 2624 | // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT |
| 2625 | assert(baseSize == 2); |
| 2626 | ZeroOrSignExtnReqd = (baseType == TYP_SHORT); |
| 2627 | } |
| 2628 | |
| 2629 | if (ZeroOrSignExtnReqd) |
| 2630 | { |
| 2631 | // Zero/sign extend the byte/short to 32-bits |
| 2632 | inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType)); |
| 2633 | } |
| 2634 | } |
| 2635 | else |
| 2636 | { |
| 2637 | // We need a temp xmm register if the baseType is not floating point and |
| 2638 | // accessing non-zero'th element. |
| 2639 | instruction ins; |
| 2640 | |
| 2641 | if (byteShiftCnt != 0) |
| 2642 | { |
| 2643 | assert(tmpReg != REG_NA); |
| 2644 | |
| 2645 | if (tmpReg != srcReg) |
| 2646 | { |
| 2647 | inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType)); |
| 2648 | } |
| 2649 | |
| 2650 | ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); |
| 2651 | getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt); |
| 2652 | } |
| 2653 | else |
| 2654 | { |
| 2655 | tmpReg = srcReg; |
| 2656 | } |
| 2657 | |
| 2658 | assert(tmpReg != REG_NA); |
| 2659 | ins = ins_CopyFloatToInt(TYP_FLOAT, baseType); |
| 2660 | // (Note that for mov_xmm2i, the int register is always in the reg2 position.) |
| 2661 | inst_RV_RV(ins, tmpReg, targetReg, baseType); |
| 2662 | } |
| 2663 | } |
| 2664 | |
| 2665 | genProduceReg(simdNode); |
| 2666 | } |
| 2667 | |
| 2668 | //------------------------------------------------------------------------------------ |
| 2669 | // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i. |
| 2670 | // |
| 2671 | // Arguments: |
| 2672 | // simdNode - The GT_SIMD node |
| 2673 | // |
| 2674 | // Return Value: |
| 2675 | // None. |
| 2676 | // |
| 2677 | // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case. |
| 2678 | // |
| 2679 | void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode) |
| 2680 | { |
| 2681 | // Determine index based on intrinsic ID |
| 2682 | int index = -1; |
| 2683 | switch (simdNode->gtSIMDIntrinsicID) |
| 2684 | { |
| 2685 | case SIMDIntrinsicSetX: |
| 2686 | index = 0; |
| 2687 | break; |
| 2688 | case SIMDIntrinsicSetY: |
| 2689 | index = 1; |
| 2690 | break; |
| 2691 | case SIMDIntrinsicSetZ: |
| 2692 | index = 2; |
| 2693 | break; |
| 2694 | case SIMDIntrinsicSetW: |
| 2695 | index = 3; |
| 2696 | break; |
| 2697 | |
| 2698 | default: |
| 2699 | unreached(); |
| 2700 | } |
| 2701 | assert(index != -1); |
| 2702 | |
| 2703 | // op1 is the SIMD vector |
| 2704 | // op2 is the value to be set |
| 2705 | GenTree* op1 = simdNode->gtGetOp1(); |
| 2706 | GenTree* op2 = simdNode->gtGetOp2(); |
| 2707 | |
| 2708 | var_types baseType = simdNode->gtSIMDBaseType; |
| 2709 | regNumber targetReg = simdNode->gtRegNum; |
| 2710 | assert(targetReg != REG_NA); |
| 2711 | var_types targetType = simdNode->TypeGet(); |
| 2712 | assert(varTypeIsSIMD(targetType)); |
| 2713 | |
| 2714 | // The following assert must hold;
| 2715 | // this is supported only on Vector2f/3f/4f right now.
| 2716 | noway_assert(baseType == TYP_FLOAT); |
| 2717 | assert(op2->TypeGet() == baseType); |
| 2718 | assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType))); |
| 2719 | |
| 2720 | genConsumeOperands(simdNode); |
| 2721 | regNumber op1Reg = op1->gtRegNum; |
| 2722 | regNumber op2Reg = op2->gtRegNum; |
| 2723 | |
| 2724 | // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate. |
| 2725 | if (targetReg != op1Reg) |
| 2726 | { |
| 2727 | inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); |
| 2728 | } |
| 2729 | |
| 2730 | // Right now this intrinsic is supported only for float base type vectors. |
| 2731 | // If we need to support other base type vectors in the future, the
| 2732 | // logic below will need modification.
| 2733 | noway_assert(baseType == TYP_FLOAT); |
| 2734 | |
| 2735 | if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) |
| 2736 | { |
| 2737 | // We need one additional int register as scratch |
| 2738 | regNumber tmpReg = simdNode->GetSingleTempReg(); |
| 2739 | assert(genIsValidIntReg(tmpReg)); |
| 2740 | |
| 2741 | // Move the value from xmm reg to an int reg |
| 2742 | instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT); |
| 2743 | // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
| 2744 | inst_RV_RV(ins, op2Reg, tmpReg, baseType); |
| 2745 | |
| 2746 | // First insert the lower 16-bits of tmpReg in targetReg at 2*index position |
| 2747 | // since every float has two 16-bit words. |
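| | // e.g., for SIMDIntrinsicSetY (index == 1) the two pinsrw instructions below write word
| | // positions 2 and 3, i.e. the two 16-bit halves of float element 1.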
| 2748 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index); |
| 2749 | |
| 2750 | // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position |
| 2751 | inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16); |
| 2752 | getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1); |
| 2753 | } |
| 2754 | else |
| 2755 | { |
| 2756 | unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index)); |
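| | // e.g., SIMDIntrinsicSetZ (index == 2) yields an immediate of 0x20: source element 0 of
| | // op2Reg is inserted into element 2 of targetReg, with no elements zeroed.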
| 2757 | inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm); |
    }

    genProduceReg(simdNode);
}

//------------------------------------------------------------------------
// genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
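//
// Notes:
//    op2 must be a contained integer constant that supplies the shuffle control byte.
//    For the four-element (32-bit) forms, each pair of bits in the control byte selects
//    the source element for the corresponding destination element; since the source and
//    destination registers are the same here, a control of 0x1B (0b00011011), for
//    example, reverses the four elements.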
//
void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
    noway_assert(compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported);

    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    assert(op2->isContained());
    assert(op2->IsCnsIntOrI());
    int       shuffleControl = (int)op2->AsIntConCommon()->IconValue();
    var_types baseType       = simdNode->gtSIMDBaseType;
    var_types targetType     = simdNode->TypeGet();
    regNumber targetReg      = simdNode->gtRegNum;
    assert(targetReg != REG_NA);

    regNumber op1Reg = genConsumeReg(op1);
    if (targetReg != op1Reg)
    {
        inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }

    instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
    getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
    genProduceReg(simdNode);
}

//-----------------------------------------------------------------------------
// genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
//    Since Vector3 is not a hardware-supported write size, the store is performed
//    as two writes: an 8-byte write followed by a 4-byte write.
//
// Arguments:
//    treeNode - tree node that is attempting to store indirect
//
// Return Value:
//    None.
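//
// Notes:
//    Assuming addr is in rax and data is in xmm0 (an illustrative sketch; the
//    exact instructions chosen depend on the emitter):
//      movsd  qword ptr [rax], xmm0    ; store the lower 8 bytes
//      pshufd xmm1, xmm0, 0x02         ; move element 2 down to element 0 of the temp
//      movss  dword ptr [rax+8], xmm1  ; store the upper 4 bytes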
//
void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_STOREIND);

    GenTree* addr = treeNode->gtOp.gtOp1;
    GenTree* data = treeNode->gtOp.gtOp2;

    // addr and data should not be contained.
    assert(!data->isContained());
    assert(!addr->isContained());

#ifdef DEBUG
    // Should not require a write barrier.
    GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
    assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
#endif

    // Need an additional Xmm register to extract the upper 4 bytes from data.
    regNumber tmpReg = treeNode->GetSingleTempReg();

    genConsumeOperands(treeNode->AsOp());

    // 8-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);

    // Extract the upper 4 bytes from data
    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);

    // 4-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
}

//-----------------------------------------------------------------------------
// genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
//    Since Vector3 is not a hardware-supported read size, the load is performed
//    as two reads: a 4-byte load of the upper element followed by an 8-byte load
//    of the lower two elements.
//
// Arguments:
//    treeNode - tree node of GT_IND
//
// Return Value:
//    None.
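//
// Notes:
//    Assuming the address is in rax (an illustrative sketch):
//      movss  xmm1, dword ptr [rax+8]  ; load the upper 4 bytes
//      movsd  xmm0, qword ptr [rax]    ; load the lower 8 bytes
//      shufps xmm0, xmm1, 0x44         ; combine: xmm0 = { x, y, z, 0 }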
//
void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_IND);

    regNumber targetReg = treeNode->gtRegNum;
    GenTree*  op1       = treeNode->gtOp.gtOp1;
    assert(!op1->isContained());
    regNumber operandReg = genConsumeReg(op1);

    // Need an additional Xmm register, different from targetReg, to read the upper 4 bytes.
    regNumber tmpReg = treeNode->GetSingleTempReg();
    assert(tmpReg != targetReg);

    // Load the upper 4 bytes into tmpReg
    getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);

    // Load the lower 8 bytes into targetReg
    getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);

    // Combine the upper 4 bytes and the lower 8 bytes in targetReg
    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
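
    // SHUFFLE_YXYX (01 00 01 00) selects { dst[0], dst[1], src[0], src[1] }: the lower
    // two floats stay in place, element 2 comes from tmpReg, and element 3 picks up
    // tmpReg's second element, which the 4-byte load zeroed.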

    genProduceReg(treeNode);
}

//-----------------------------------------------------------------------------
// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
//    Since Vector3 is not a hardware-supported write size, the store is performed
//    as two stores: an 8-byte store followed by a 4-byte store.
//
// Arguments:
//    treeNode - tree node that is attempting to store TYP_SIMD12 field
//
// Return Value:
//    None.
//
void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
{
    assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));

    unsigned offs   = 0;
    unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
    assert(varNum < compiler->lvaCount);

    if (treeNode->OperGet() == GT_STORE_LCL_FLD)
    {
        offs = treeNode->gtLclFld.gtLclOffs;
    }

    GenTree* op1 = treeNode->gtOp.gtOp1;
    assert(!op1->isContained());
    regNumber operandReg = genConsumeReg(op1);

    // Need an additional Xmm register to extract the upper 4 bytes from data.
    regNumber tmpReg = treeNode->GetSingleTempReg();

    // Store the lower 8 bytes
    getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);

    // Extract the upper 4 bytes from operandReg
    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);

    // Store the upper 4 bytes
    getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs + 8);
}

//-----------------------------------------------------------------------------
// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
//    Since Vector3 is not a hardware-supported read size, the load is performed
//    as two reads: a 4-byte read followed by an 8-byte read.
//
// Arguments:
//    treeNode - tree node that is attempting to load TYP_SIMD12 field
//
// Return Value:
//    None.
//
void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
{
    assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));

    regNumber targetReg = treeNode->gtRegNum;
    unsigned  offs      = 0;
    unsigned  varNum    = treeNode->gtLclVarCommon.gtLclNum;
    assert(varNum < compiler->lvaCount);

    if (treeNode->OperGet() == GT_LCL_FLD)
    {
        offs = treeNode->gtLclFld.gtLclOffs;
    }

    // Need an additional Xmm register, different from targetReg, to read the upper 4 bytes.
    regNumber tmpReg = treeNode->GetSingleTempReg();
    assert(tmpReg != targetReg);

    // Read the upper 4 bytes into tmpReg
    getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs + 8);

    // Read the lower 8 bytes into targetReg
    getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);

    // Combine the upper 4 bytes and the lower 8 bytes in targetReg
    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);

    genProduceReg(treeNode);
}

#ifdef _TARGET_X86_

//-----------------------------------------------------------------------------
// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) value to the stack.
//    Since Vector3 is not a hardware-supported write size, the store is performed
//    as two stores: an 8-byte store followed by a 4-byte store. The stack is
//    assumed to have already been adjusted.
//
// Arguments:
//    operandReg - the xmm register containing the SIMD12 to store.
//    tmpReg     - an xmm register that can be used as a temporary for the operation.
//
// Return Value:
//    None.
//
void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
{
    assert(genIsValidFloatReg(operandReg));
    assert(genIsValidFloatReg(tmpReg));

    // 8-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);

    // Extract the upper 4 bytes from the data
    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);

    // 4-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
}

//-----------------------------------------------------------------------------
// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) stack argument.
//    Since Vector3 is not a hardware-supported write size, the store is performed
//    as two stores: an 8-byte store followed by a 4-byte store. The stack is
//    assumed to have already been adjusted.
//
// Arguments:
//    treeNode - tree node that is attempting to store TYP_SIMD12 field
//
// Return Value:
//    None.
//
void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_PUTARG_STK);

    GenTree* op1 = treeNode->gtOp.gtOp1;
    assert(!op1->isContained());
    regNumber operandReg = genConsumeReg(op1);

    // Need an additional Xmm register to extract the upper 4 bytes from the data.
    regNumber tmpReg = treeNode->GetSingleTempReg();

    genStoreSIMD12ToStack(operandReg, tmpReg);
}

#endif // _TARGET_X86_

//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
//    the given register, if any, or to memory.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    The upper half of all AVX registers is volatile, even the callee-save registers.
//    When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
//    to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
//    register. If such a register cannot be found, it will save the upper half to an available
//    caller-save register. In that case, this node will be marked GTF_SPILL, which will cause
//    genProduceReg to save the 16-byte value to the stack. (Note that if there are no caller-save
//    registers available, the entire 32-byte value will be spilled to the stack.)
//
void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);

    GenTree* op1 = simdNode->gtGetOp1();
    assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
    regNumber targetReg = simdNode->gtRegNum;
    regNumber op1Reg    = genConsumeReg(op1);
    assert(op1Reg != REG_NA);
    assert(targetReg != REG_NA);
    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
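
    // For example, if op1 lives in ymm0 and targetReg is xmm7 (an illustrative sketch):
    //   vextractf128 xmm7, ymm0, 1   ; copy the upper 128 bits of ymm0 into xmm7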

    genProduceReg(simdNode);
}

//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperRestore: restore the upper half of a TYP_SIMD32 vector
//    from the given register, if any, or from memory.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
//    have their home register, this node has its targetReg on the lclVar child, and its source
//    on the simdNode.
//    Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
//    an upper half to a caller-save register, this node will be marked GTF_SPILLED. However, unlike
//    most spill scenarios, the saved tree will be different from the restored tree, but the spill
//    restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
//    spilled tree (saveNode) in order to perform the reload. We can easily find that tree,
//    as it is in the spill descriptor for the register from which it was saved.
//
void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);

    GenTree* op1 = simdNode->gtGetOp1();
    assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
    regNumber srcReg    = simdNode->gtRegNum;
    regNumber lclVarReg = genConsumeReg(op1);
    unsigned  varNum    = op1->AsLclVarCommon()->gtLclNum;
    assert(lclVarReg != REG_NA);
    assert(srcReg != REG_NA);
    if (simdNode->gtFlags & GTF_SPILLED)
    {
        GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
        noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
        genConsumeReg(saveNode);
    }
    getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
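
    // vinsertf128 is a three-operand VEX instruction; the two-register emitter form used
    // here reuses lclVarReg as both the destination and the first source, so the emitted
    // instruction is roughly (an illustrative sketch, with the lclVar in ymm0 and srcReg
    // in xmm7):
    //   vinsertf128 ymm0, ymm0, xmm7, 1   ; put the saved 128 bits back into the upper half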
}

//------------------------------------------------------------------------
// genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
//    routine which in turn calls the appropriate genSIMDIntrinsicXXX() routine.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
//    a limited set of methods.
//
void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
{
    // NYI for unsupported base types
    if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG &&
        simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE &&
        simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE &&
        simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE &&
        simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG)
    {
        noway_assert(!"SIMD intrinsic with unsupported base type.");
    }

    switch (simdNode->gtSIMDIntrinsicID)
    {
        case SIMDIntrinsicInit:
            genSIMDIntrinsicInit(simdNode);
            break;

        case SIMDIntrinsicInitN:
            genSIMDIntrinsicInitN(simdNode);
            break;

        case SIMDIntrinsicSqrt:
        case SIMDIntrinsicCast:
        case SIMDIntrinsicAbs:
            genSIMDIntrinsicUnOp(simdNode);
            break;

        case SIMDIntrinsicConvertToSingle:
        case SIMDIntrinsicConvertToInt32:
            genSIMDIntrinsic32BitConvert(simdNode);
            break;

        case SIMDIntrinsicConvertToDouble:
        case SIMDIntrinsicConvertToInt64:
            genSIMDIntrinsic64BitConvert(simdNode);
            break;

        case SIMDIntrinsicWidenLo:
        case SIMDIntrinsicWidenHi:
            genSIMDIntrinsicWiden(simdNode);
            break;

        case SIMDIntrinsicNarrow:
            genSIMDIntrinsicNarrow(simdNode);
            break;

        case SIMDIntrinsicAdd:
        case SIMDIntrinsicSub:
        case SIMDIntrinsicMul:
        case SIMDIntrinsicDiv:
        case SIMDIntrinsicBitwiseAnd:
        case SIMDIntrinsicBitwiseAndNot:
        case SIMDIntrinsicBitwiseOr:
        case SIMDIntrinsicBitwiseXor:
        case SIMDIntrinsicMin:
        case SIMDIntrinsicMax:
            genSIMDIntrinsicBinOp(simdNode);
            break;

        case SIMDIntrinsicOpEquality:
        case SIMDIntrinsicOpInEquality:
        case SIMDIntrinsicEqual:
        case SIMDIntrinsicLessThan:
        case SIMDIntrinsicGreaterThan:
        case SIMDIntrinsicLessThanOrEqual:
        case SIMDIntrinsicGreaterThanOrEqual:
            genSIMDIntrinsicRelOp(simdNode);
            break;

        case SIMDIntrinsicDotProduct:
            genSIMDIntrinsicDotProduct(simdNode);
            break;

        case SIMDIntrinsicGetItem:
            genSIMDIntrinsicGetItem(simdNode);
            break;

        case SIMDIntrinsicShuffleSSE2:
            genSIMDIntrinsicShuffleSSE2(simdNode);
            break;

        case SIMDIntrinsicSetX:
        case SIMDIntrinsicSetY:
        case SIMDIntrinsicSetZ:
        case SIMDIntrinsicSetW:
            genSIMDIntrinsicSetItem(simdNode);
            break;

        case SIMDIntrinsicUpperSave:
            genSIMDIntrinsicUpperSave(simdNode);
            break;

        case SIMDIntrinsicUpperRestore:
            genSIMDIntrinsicUpperRestore(simdNode);
            break;

        default:
            noway_assert(!"Unimplemented SIMD intrinsic.");
            unreached();
    }
}

#endif // FEATURE_SIMD
#endif // _TARGET_XARCH_