| 1 | /* |
| 2 | * Copyright (c) 2016, Intel Corporation. |
| 3 | * Intel Math Library (LIBM) Source Code |
| 4 | * |
| 5 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 6 | * |
| 7 | * This code is free software; you can redistribute it and/or modify it |
| 8 | * under the terms of the GNU General Public License version 2 only, as |
| 9 | * published by the Free Software Foundation. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 22 | * or visit www.oracle.com if you need additional information or have any |
| 23 | * questions. |
| 24 | * |
| 25 | */ |
| 26 | |
| 27 | #include "precompiled.hpp" |
| 28 | #include "asm/assembler.hpp" |
| 29 | #include "asm/assembler.inline.hpp" |
| 30 | #include "macroAssembler_x86.hpp" |
| 31 | #include "runtime/stubRoutines.hpp" |
| 32 | #include "utilities/globalDefinitions.hpp" |
| 33 | |
| 34 | /******************************************************************************/ |
| 35 | // ALGORITHM DESCRIPTION - COS() |
| 36 | // --------------------- |
| 37 | // |
| 38 | // 1. RANGE REDUCTION |
| 39 | // |
| 40 | // We perform an initial range reduction from X to r with |
| 41 | // |
| 42 | // X =~= N * pi/32 + r |
| 43 | // |
| 44 | // so that |r| <= pi/64 + epsilon. We restrict inputs to those |
| 45 | // where |N| <= 932560. Beyond this, the range reduction is |
| 46 | // insufficiently accurate. For extremely small inputs, |
| 47 | // denormalization can occur internally, impacting performance. |
| 48 | // This means that the main path is actually only taken for |
| 49 | // 2^-252 <= |X| < 90112. |
| 50 | // |
| 51 | // To avoid branches, we perform the range reduction to full |
| 52 | // accuracy each time. |
| 53 | // |
| 54 | // X - N * (P_1 + P_2 + P_3) |
| 55 | // |
| 56 | // where P_1 and P_2 are 32-bit numbers (so multiplication by N |
| 57 | // is exact) and P_3 is a 53-bit number. Together, these |
| 58 | // approximate pi/32 well enough for all cases in the restricted |
| 59 | // range. |
| 60 | // |
| 61 | // The main reduction sequence is: |
| 62 | // |
| 63 | // y = 32/pi * x |
| 64 | // N = integer(y) |
| 65 | // (computed by adding and subtracting off SHIFTER) |
| 66 | // |
| 67 | // m_1 = N * P_1 |
| 68 | // m_2 = N * P_2 |
| 69 | // r_1 = x - m_1 |
| 70 | // r = r_1 - m_2 |
| 71 | // (this r can be used for most of the calculation) |
| 72 | // |
| 73 | // c_1 = r_1 - r |
| 74 | // m_3 = N * P_3 |
| 75 | // c_2 = c_1 - m_2 |
| 76 | // c = c_2 - m_3 |
| 77 | // |
| 78 | // 2. MAIN ALGORITHM |
| 79 | // |
| 80 | // The algorithm uses a table lookup based on B = M * pi / 32 |
| 81 | // where M = N mod 64. The stored values are: |
| 82 | // sigma closest power of 2 to cos(B) |
| 83 | // C_hl 53-bit cos(B) - sigma |
| 84 | // S_hi + S_lo 2 * 53-bit sin(B) |
| 85 | // |
| 86 | // The computation is organized as follows: |
| 87 | // |
| 88 | // sin(B + r + c) = [sin(B) + sigma * r] + |
| 89 | // r * (cos(B) - sigma) + |
| 90 | // sin(B) * [cos(r + c) - 1] + |
| 91 | // cos(B) * [sin(r + c) - r] |
| 92 | // |
| 93 | // which is approximately: |
| 94 | // |
| 95 | // [S_hi + sigma * r] + |
| 96 | // C_hl * r + |
| 97 | // S_lo + S_hi * [(cos(r) - 1) - r * c] + |
| 98 | // (C_hl + sigma) * [(sin(r) - r) + c] |
| 99 | // |
| 100 | // and this is what is actually computed. We separate this sum |
| 101 | // into four parts: |
| 102 | // |
| 103 | // hi + med + pols + corr |
| 104 | // |
| 105 | // where |
| 106 | // |
| 107 | // hi = S_hi + sigma * r |
| 108 | // med = C_hl * r |
| 109 | // pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r) |
| 110 | // corr = S_lo + c * ((C_hl + sigma) - S_hi * r) |
| 111 | // |
| 112 | // 3. POLYNOMIAL |
| 113 | // |
| 114 | // The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) * |
| 115 | // (sin(r) - r) can be rearranged freely, since it is quite |
| 116 | // small, so we exploit parallelism to the fullest. |
| 117 | // |
| 118 | // psc4 = SC_4 * r_1 |
| 119 | // msc4 = psc4 * r |
| 120 | // r2 = r * r |
| 121 | // msc2 = SC_2 * r2 |
| 122 | // r4 = r2 * r2 |
| 123 | // psc3 = SC_3 + msc4 |
| 124 | // psc1 = SC_1 + msc2 |
| 125 | // msc3 = r4 * psc3 |
| 126 | // sincospols = psc1 + msc3 |
| 127 | // pols = sincospols * |
| 128 | // <S_hi * r^2 | (C_hl + sigma) * r^3> |
| 129 | // |
| 130 | // 4. CORRECTION TERM |
| 131 | // |
| 132 | // This is where the "c" component of the range reduction is |
| 133 | // taken into account; recall that just "r" is used for most of |
| 134 | // the calculation. |
| 135 | // |
| 136 | // -c = m_3 - c_2 |
| 137 | // -d = S_hi * r - (C_hl + sigma) |
| 138 | // corr = -c * -d + S_lo |
| 139 | // |
| 140 | // 5. COMPENSATED SUMMATIONS |
| 141 | // |
| 142 | // The two successive compensated summations add up the high |
| 143 | // and medium parts, leaving just the low parts to add up at |
| 144 | // the end. |
| 145 | // |
| 146 | // rs = sigma * r |
| 147 | // res_int = S_hi + rs |
| 148 | // k_0 = S_hi - res_int |
| 149 | // k_2 = k_0 + rs |
| 150 | // med = C_hl * r |
| 151 | // res_hi = res_int + med |
| 152 | // k_1 = res_int - res_hi |
| 153 | // k_3 = k_1 + med |
| 154 | // |
| 155 | // 6. FINAL SUMMATION |
| 156 | // |
| 157 | // We now add up all the small parts: |
| 158 | // |
| 159 | // res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3 |
| 160 | // |
| 161 | // Now the overall result is just: |
| 162 | // |
| 163 | // res_hi + res_lo |
| 164 | // |
| 165 | // 7. SMALL ARGUMENTS |
| 166 | // |
| 167 | // Inputs with |X| < 2^-252 are treated specially as |
| 168 | // 1 - |x|. |
| 169 | // |
| 170 | // Special cases: |
| 171 | // cos(NaN) = quiet NaN, and raise invalid exception |
| 172 | // cos(INF) = NaN and raise invalid exception |
| 173 | // cos(0) = 1 |
| 174 | // |
| 175 | /******************************************************************************/ |
| 176 | |
| 177 | #ifdef _LP64 |
| 178 | // The 64 bit code is at most SSE2 compliant |
// Bit pattern of the double 1.0 (0x3ff0000000000000) stored as two 32-bit
// words, low word first. The LP64 fast_cos in this file loads it with an
// 8-byte movq for its "1 - |x|" tiny-argument result.
| 179 | ATTRIBUTE_ALIGNED(8) juint _ONE[] = |
| 180 | { |
| 181 | 0x00000000UL, 0x3ff00000UL |
| 182 | }; |
// Emits the LP64 instruction sequence that computes cos(x) (see the
// algorithm description at the top of this file).
//
// The argument is taken from xmm0 and the result is left in xmm0; xmm1-xmm7
// are overwritten. In addition to the Register parameters, the body names
// rax, rcx, rdx, rsi, rdi and rbx directly. rbx is pushed on entry and popped
// on exit; rsi and rdi are written without being saved in this function.
// NOTE(review): the parameters named eax/ecx/edx are presumably bound to
// rax/rcx/rdx by the caller, since the code mixes e.g. `edx` (parameter) with
// `rdx` (fixed register) for the same value - confirm at the call site.
| 183 | void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) { |
| 184 | |
| 185 | Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1; |
| 186 | Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1; |
| 187 | Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1; |
| 188 | Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4, start; |
| 189 | |
// The general-purpose register arguments must all be distinct.
| 190 | assert_different_registers(r8, r9, r10, r11, eax, ecx, edx); |
| 191 | |
// Addresses of the constant tables; all come from StubRoutines::x86 except
// ONE, which is the file-local _ONE array.
| 192 | address ONEHALF = StubRoutines::x86::_ONEHALF_addr(); |
| 193 | address P_2 = StubRoutines::x86::_P_2_addr(); |
| 194 | address SC_4 = StubRoutines::x86::_SC_4_addr(); |
| 195 | address Ctable = StubRoutines::x86::_Ctable_addr(); |
| 196 | address SC_2 = StubRoutines::x86::_SC_2_addr(); |
| 197 | address SC_3 = StubRoutines::x86::_SC_3_addr(); |
| 198 | address SC_1 = StubRoutines::x86::_SC_1_addr(); |
| 199 | address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr(); |
| 200 | address PI_4 = (address)StubRoutines::x86::_PI_4_addr(); |
| 201 | address PI32INV = (address)StubRoutines::x86::_PI32INV_addr(); |
| 202 | address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr(); |
| 203 | address P_1 = (address)StubRoutines::x86::_P_1_addr(); |
| 204 | address P_3 = (address)StubRoutines::x86::_P_3_addr(); |
| 205 | address ONE = (address)_ONE; |
| 206 | address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr(); |
| 207 | |
// Prologue: save rbx, reserve 16 bytes of stack and spill x to [rsp + 8].
| 208 | bind(start); |
| 209 | push(rbx); |
| 210 | subq(rsp, 16); |
| 211 | movsd(Address(rsp, 8), xmm0); |
| 212 | |
// Range classification on the high dword of x ([rsp + 12]):
//   mask 2147418112 = 0x7FFF0000 (drops the sign bit and the low 16 bits),
//   bias  808452096 = 0x30300000,
//   bound 281346048 = 0x10C50000.
// One unsigned compare sends every input outside that window to the
// special-case code at L_2TAG_PACKET_0_0_1.
| 213 | bind(B1_2); |
| 214 | movl(eax, Address(rsp, 12)); |
| 215 | movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL |
| 216 | andl(eax, 2147418112); |
| 217 | subl(eax, 808452096); |
| 218 | cmpl(eax, 281346048); |
| 219 | jcc(Assembler::above, L_2TAG_PACKET_0_0_1); |
// Main path. N = trunc(x * PI32INV + copysign(0.5, x)): ONEHALF is OR-ed
// with the sign bit of x before the add and the truncating conversion.
| 220 | mulsd(xmm1, xmm0); |
| 221 | movdqu(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL |
| 222 | movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL |
| 223 | pand(xmm4, xmm0); |
| 224 | por(xmm5, xmm4); |
| 225 | addpd(xmm1, xmm5); |
| 226 | cvttsd2sil(edx, xmm1); |
| 227 | cvtsi2sdl(xmm1, edx); |
| 228 | movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL |
| 229 | movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL |
| 230 | mulsd(xmm3, xmm1); |
| 231 | unpcklpd(xmm1, xmm1); |
// 1865232 = 64 * 29144 + 16, so after the "& 63" below the table index is
// (N + 16) mod 64.
// NOTE(review): 16 steps of pi/32 is pi/2, which is presumably how the sine
// formulation in the file header yields cosine - confirm.
| 232 | addq(rdx, 1865232); |
| 233 | movdqu(xmm4, xmm0); |
| 234 | andq(rdx, 63); |
| 235 | movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL |
// rax = address of the selected Ctable entry; entries are 32 bytes apart
// (index << 5) and are read below at offsets 0, 8, 16 and 24.
| 236 | lea(rax, ExternalAddress(Ctable)); |
| 237 | shlq(rdx, 5); |
| 238 | addq(rax, rdx); |
| 239 | mulpd(xmm2, xmm1); |
| 240 | subsd(xmm0, xmm3); |
| 241 | mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL |
| 242 | subsd(xmm4, xmm3); |
| 243 | movq(xmm7, Address(rax, 8)); |
| 244 | unpcklpd(xmm0, xmm0); |
| 245 | movdqu(xmm3, xmm4); |
| 246 | subsd(xmm4, xmm2); |
| 247 | mulpd(xmm5, xmm0); |
| 248 | subpd(xmm0, xmm2); |
| 249 | movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL |
| 250 | mulsd(xmm7, xmm4); |
| 251 | subsd(xmm3, xmm4); |
| 252 | mulpd(xmm5, xmm0); |
| 253 | mulpd(xmm0, xmm0); |
| 254 | subsd(xmm3, xmm2); |
| 255 | movdqu(xmm2, Address(rax, 0)); |
| 256 | subsd(xmm1, xmm3); |
| 257 | movq(xmm3, Address(rax, 24)); |
| 258 | addsd(xmm2, xmm3); |
| 259 | subsd(xmm7, xmm2); |
| 260 | mulsd(xmm2, xmm4); |
| 261 | mulpd(xmm6, xmm0); |
| 262 | mulsd(xmm3, xmm4); |
| 263 | mulpd(xmm2, xmm0); |
| 264 | mulpd(xmm0, xmm0); |
| 265 | addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL |
| 266 | mulsd(xmm4, Address(rax, 0)); |
| 267 | addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL |
| 268 | mulpd(xmm5, xmm0); |
| 269 | movdqu(xmm0, xmm3); |
| 270 | addsd(xmm3, Address(rax, 8)); |
| 271 | mulpd(xmm1, xmm7); |
| 272 | movdqu(xmm7, xmm4); |
| 273 | addsd(xmm4, xmm3); |
| 274 | addpd(xmm6, xmm5); |
| 275 | movq(xmm5, Address(rax, 8)); |
| 276 | subsd(xmm5, xmm3); |
| 277 | subsd(xmm3, xmm4); |
| 278 | addsd(xmm1, Address(rax, 16)); |
| 279 | mulpd(xmm6, xmm2); |
// Final summation: the small terms are accumulated in xmm0 first and the
// high part (xmm4) is added last; the result stays in xmm0.
| 280 | addsd(xmm0, xmm5); |
| 281 | addsd(xmm3, xmm7); |
| 282 | addsd(xmm0, xmm1); |
| 283 | addsd(xmm0, xmm3); |
| 284 | addsd(xmm0, xmm6); |
| 285 | unpckhpd(xmm6, xmm6); |
| 286 | addsd(xmm0, xmm6); |
| 287 | addsd(xmm0, xmm4); |
| 288 | jmp(B1_4); |
| 289 | |
// Out-of-window inputs. The flags are still those of the cmpl in B1_2:
// a signed "greater" result selects the large-argument code, otherwise the
// input is below the window and the result is 1 - |x| (sign bit cleared via
// word 3 of xmm0, then subtracted from ONE).
| 290 | bind(L_2TAG_PACKET_0_0_1); |
| 291 | jcc(Assembler::greater, L_2TAG_PACKET_1_0_1); |
| 292 | pextrw(eax, xmm0, 3); |
| 293 | andl(eax, 32767); |
| 294 | pinsrw(xmm0, eax, 3); |
| 295 | movq(xmm1, ExternalAddress(ONE)); //0x00000000UL, 0x3ff00000UL |
| 296 | subsd(xmm1, xmm0); |
| 297 | movdqu(xmm0, xmm1); |
| 298 | jmp(B1_4); |
| 299 | |
// Large arguments. 32752 = 0x7FF0: if every exponent bit is set the input
// is Inf or NaN and is handled at L_2TAG_PACKET_2_0_1.
| 300 | bind(L_2TAG_PACKET_1_0_1); |
| 301 | pextrw(eax, xmm0, 3); |
| 302 | andl(eax, 32752); |
| 303 | cmpl(eax, 32752); |
| 304 | jcc(Assembler::equal, L_2TAG_PACKET_2_0_1); |
// Otherwise derive a 4-byte-aligned offset into PI_INV_TABLE from the
// exponent field and multiply the significand of x by seven consecutive
// 32-bit table words (offsets 24 down to 0) using integer arithmetic.
| 305 | pextrw(ecx, xmm0, 3); |
| 306 | andl(ecx, 32752); |
| 307 | subl(ecx, 16224); |
| 308 | shrl(ecx, 7); |
| 309 | andl(ecx, 65532); |
| 310 | lea(r11, ExternalAddress(PI_INV_TABLE)); |
| 311 | addq(rcx, r11); |
// Split the bits of x: rdx = low 32 bits; rax = bits 32..51 with bit 20
// forced to 1 (the implicit leading significand bit).
| 312 | movdq(rax, xmm0); |
| 313 | movl(r10, Address(rcx, 20)); |
| 314 | movl(r8, Address(rcx, 24)); |
| 315 | movl(edx, eax); |
| 316 | shrq(rax, 21); |
| 317 | orl(eax, INT_MIN); |
| 318 | shrl(eax, 11); |
// Schoolbook multi-word multiply with 32-bit limbs; partial products are
// split with movl (low half) / shrq 32 (high half) and accumulated into
// r8, r10 and r9.
| 319 | movl(r9, r10); |
| 320 | imulq(r10, rdx); |
| 321 | imulq(r9, rax); |
| 322 | imulq(r8, rax); |
| 323 | movl(rsi, Address(rcx, 16)); |
| 324 | movl(rdi, Address(rcx, 12)); |
| 325 | movl(r11, r10); |
| 326 | shrq(r10, 32); |
| 327 | addq(r9, r10); |
| 328 | addq(r11, r8); |
| 329 | movl(r8, r11); |
| 330 | shrq(r11, 32); |
| 331 | addq(r9, r11); |
| 332 | movl(r10, rsi); |
| 333 | imulq(rsi, rdx); |
| 334 | imulq(r10, rax); |
| 335 | movl(r11, rdi); |
| 336 | imulq(rdi, rdx); |
| 337 | movl(rbx, rsi); |
| 338 | shrq(rsi, 32); |
| 339 | addq(r9, rbx); |
| 340 | movl(rbx, r9); |
| 341 | shrq(r9, 32); |
| 342 | addq(r10, rsi); |
| 343 | addq(r10, r9); |
| 344 | shlq(rbx, 32); |
| 345 | orq(r8, rbx); |
| 346 | imulq(r11, rax); |
| 347 | movl(r9, Address(rcx, 8)); |
| 348 | movl(rsi, Address(rcx, 4)); |
| 349 | movl(rbx, rdi); |
| 350 | shrq(rdi, 32); |
| 351 | addq(r10, rbx); |
| 352 | movl(rbx, r10); |
| 353 | shrq(r10, 32); |
| 354 | addq(r11, rdi); |
| 355 | addq(r11, r10); |
| 356 | movq(rdi, r9); |
| 357 | imulq(r9, rdx); |
| 358 | imulq(rdi, rax); |
| 359 | movl(r10, r9); |
| 360 | shrq(r9, 32); |
| 361 | addq(r11, r10); |
| 362 | movl(r10, r11); |
| 363 | shrq(r11, 32); |
| 364 | addq(rdi, r9); |
| 365 | addq(rdi, r11); |
| 366 | movq(r9, rsi); |
| 367 | imulq(rsi, rdx); |
| 368 | imulq(r9, rax); |
| 369 | shlq(r10, 32); |
| 370 | orq(r10, rbx); |
| 371 | movl(eax, Address(rcx, 0)); |
| 372 | movl(r11, rsi); |
| 373 | shrq(rsi, 32); |
| 374 | addq(rdi, r11); |
| 375 | movl(r11, rdi); |
| 376 | shrq(rdi, 32); |
| 377 | addq(r9, rsi); |
| 378 | addq(r9, rdi); |
| 379 | imulq(rdx, rax); |
// Recover the byte offset into PI_INV_TABLE (rcx - table base), scale it by
// 8 and combine it with the unbiased exponent of x to get the shift count
// in ecx. rsi keeps the sign bit of x (mask 32768 on word 3).
| 380 | pextrw(rbx, xmm0, 3); |
| 381 | lea(rdi, ExternalAddress(PI_INV_TABLE)); |
| 382 | subq(rcx, rdi); |
| 383 | addl(ecx, ecx); |
| 384 | addl(ecx, ecx); |
| 385 | addl(ecx, ecx); |
| 386 | addl(ecx, 19); |
| 387 | movl(rsi, 32768); |
| 388 | andl(rsi, rbx); |
| 389 | shrl(rbx, 4); |
| 390 | andl(rbx, 2047); |
| 391 | subl(rbx, 1023); |
| 392 | subl(ecx, rbx); |
| 393 | addq(r9, rdx); |
| 394 | movl(edx, ecx); |
| 395 | addl(edx, 32); |
| 396 | cmpl(ecx, 1); |
| 397 | jcc(Assembler::less, L_2TAG_PACKET_3_0_1); |
// NOTE(review): the shll/shrl/shlq/shrq calls without an immediate
// presumably shift by the count in cl - confirm against the Assembler API.
| 398 | negl(ecx); |
| 399 | addl(ecx, 29); |
| 400 | shll(r9); |
| 401 | movl(rdi, r9); |
| 402 | andl(r9, 536870911); |
| 403 | testl(r9, 268435456); |
| 404 | jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1); |
| 405 | shrl(r9); |
| 406 | movl(rbx, 0); |
| 407 | shlq(r9, 32); |
| 408 | orq(r9, r11); |
| 409 | |
| 410 | bind(L_2TAG_PACKET_5_0_1); |
| 411 | |
// If the top 64-bit word is zero, L_2TAG_PACKET_7_0_1 moves the lower words
// up and retries.
| 412 | bind(L_2TAG_PACKET_6_0_1); |
| 413 | cmpq(r9, 0); |
| 414 | jcc(Assembler::equal, L_2TAG_PACKET_7_0_1); |
| 415 | |
// Normalize: ecx = 29 - (index of the highest set bit of r9). Positive
// values fall through to the left-shift code; zero or negative values go to
// L_2TAG_PACKET_9_0_1, which re-tests the same flags.
| 416 | bind(L_2TAG_PACKET_8_0_1); |
| 417 | bsrq(r11, r9); |
| 418 | movl(ecx, 29); |
| 419 | subl(ecx, r11); |
| 420 | jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1); |
| 421 | shlq(r9); |
| 422 | movq(rax, r10); |
| 423 | shlq(r10); |
| 424 | addl(edx, ecx); |
| 425 | negl(ecx); |
| 426 | addl(ecx, 64); |
| 427 | shrq(rax); |
| 428 | shrq(r8); |
| 429 | orq(r9, rax); |
| 430 | orq(r10, r8); |
| 431 | |
// Convert the normalized integer words r9/r10 to doubles, scale them by
// powers of two whose exponent fields are assembled from edx (16368 =
// 0x3FF0) and inserted into word 3 of xmm4/xmm5, then multiply by the two
// PI_4 constants. xmm0 receives the reduced argument and xmm6 its low-order
// correction; eax receives the value built in rdi.
| 432 | bind(L_2TAG_PACKET_10_0_1); |
| 433 | cvtsi2sdq(xmm0, r9); |
| 434 | shrq(r10, 1); |
| 435 | cvtsi2sdq(xmm3, r10); |
| 436 | xorpd(xmm4, xmm4); |
| 437 | shll(edx, 4); |
| 438 | negl(edx); |
| 439 | addl(edx, 16368); |
| 440 | orl(edx, rsi); |
| 441 | xorl(edx, rbx); |
| 442 | pinsrw(xmm4, edx, 3); |
| 443 | movq(xmm2, ExternalAddress(PI_4)); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL |
| 444 | movq(xmm6, ExternalAddress(8 + PI_4)); //0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL |
| 445 | xorpd(xmm5, xmm5); |
| 446 | subl(edx, 1008); |
| 447 | pinsrw(xmm5, edx, 3); |
| 448 | mulsd(xmm0, xmm4); |
| 449 | shll(rsi, 16); |
| 450 | sarl(rsi, 31); |
| 451 | mulsd(xmm3, xmm5); |
| 452 | movdqu(xmm1, xmm0); |
| 453 | mulsd(xmm0, xmm2); |
| 454 | shrl(rdi, 29); |
| 455 | addsd(xmm1, xmm3); |
| 456 | mulsd(xmm3, xmm2); |
| 457 | addl(rdi, rsi); |
| 458 | xorl(rdi, rsi); |
| 459 | mulsd(xmm6, xmm1); |
| 460 | movl(eax, rdi); |
| 461 | addsd(xmm6, xmm3); |
| 462 | movdqu(xmm2, xmm0); |
| 463 | addsd(xmm0, xmm6); |
| 464 | subsd(xmm2, xmm0); |
| 465 | addsd(xmm6, xmm2); |
| 466 | |
// Second-stage evaluation on the reduced argument. It follows the same
// instruction pattern as the main path, with two additions: eax * 8 is
// added to the table index before the "& 63", and the correction in xmm6
// is subtracted from xmm1.
| 467 | bind(L_2TAG_PACKET_11_0_1); |
| 468 | movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL |
| 469 | mulsd(xmm1, xmm0); |
| 470 | movq(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL |
| 471 | movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL |
| 472 | pand(xmm4, xmm0); |
| 473 | por(xmm5, xmm4); |
| 474 | addpd(xmm1, xmm5); |
| 475 | cvttsd2siq(rdx, xmm1); |
| 476 | cvtsi2sdq(xmm1, rdx); |
| 477 | movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL |
| 478 | movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL |
| 479 | mulsd(xmm3, xmm1); |
| 480 | unpcklpd(xmm1, xmm1); |
| 481 | shll(eax, 3); |
| 482 | addl(edx, 1865232); |
| 483 | movdqu(xmm4, xmm0); |
| 484 | addl(edx, eax); |
| 485 | andl(edx, 63); |
| 486 | movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL |
| 487 | lea(rax, ExternalAddress(Ctable)); |
| 488 | shll(edx, 5); |
| 489 | addq(rax, rdx); |
| 490 | mulpd(xmm2, xmm1); |
| 491 | subsd(xmm0, xmm3); |
| 492 | mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL |
| 493 | subsd(xmm4, xmm3); |
| 494 | movq(xmm7, Address(rax, 8)); |
| 495 | unpcklpd(xmm0, xmm0); |
| 496 | movdqu(xmm3, xmm4); |
| 497 | subsd(xmm4, xmm2); |
| 498 | mulpd(xmm5, xmm0); |
| 499 | subpd(xmm0, xmm2); |
| 500 | mulsd(xmm7, xmm4); |
| 501 | subsd(xmm3, xmm4); |
| 502 | mulpd(xmm5, xmm0); |
| 503 | mulpd(xmm0, xmm0); |
| 504 | subsd(xmm3, xmm2); |
| 505 | movdqu(xmm2, Address(rax, 0)); |
| 506 | subsd(xmm1, xmm3); |
| 507 | movq(xmm3, Address(rax, 24)); |
| 508 | addsd(xmm2, xmm3); |
| 509 | subsd(xmm7, xmm2); |
| 510 | subsd(xmm1, xmm6); |
| 511 | movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL |
| 512 | mulsd(xmm2, xmm4); |
| 513 | mulpd(xmm6, xmm0); |
| 514 | mulsd(xmm3, xmm4); |
| 515 | mulpd(xmm2, xmm0); |
| 516 | mulpd(xmm0, xmm0); |
| 517 | addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL |
| 518 | mulsd(xmm4, Address(rax, 0)); |
| 519 | addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL |
| 520 | mulpd(xmm5, xmm0); |
| 521 | movdqu(xmm0, xmm3); |
| 522 | addsd(xmm3, Address(rax, 8)); |
| 523 | mulpd(xmm1, xmm7); |
| 524 | movdqu(xmm7, xmm4); |
| 525 | addsd(xmm4, xmm3); |
| 526 | addpd(xmm6, xmm5); |
| 527 | movq(xmm5, Address(rax, 8)); |
| 528 | subsd(xmm5, xmm3); |
| 529 | subsd(xmm3, xmm4); |
| 530 | addsd(xmm1, Address(rax, 16)); |
| 531 | mulpd(xmm6, xmm2); |
| 532 | addsd(xmm5, xmm0); |
| 533 | addsd(xmm3, xmm7); |
| 534 | addsd(xmm1, xmm5); |
| 535 | addsd(xmm1, xmm3); |
| 536 | addsd(xmm1, xmm6); |
| 537 | unpckhpd(xmm6, xmm6); |
| 538 | movdqu(xmm0, xmm4); |
| 539 | addsd(xmm1, xmm6); |
| 540 | addsd(xmm0, xmm1); |
| 541 | jmp(B1_4); |
| 542 | |
// Top word r9 is zero: move the lower words up by 64 bits (r9 <- r10,
// r10 <- r8, r8 <- 0), add 64 to the scale counter in edx and retry, at most
// twice. If all words are zero, use 0 for both the reduced argument (xmm0)
// and the correction (xmm6).
| 543 | bind(L_2TAG_PACKET_7_0_1); |
| 544 | addl(edx, 64); |
| 545 | movq(r9, r10); |
| 546 | movq(r10, r8); |
| 547 | movl(r8, 0); |
| 548 | cmpq(r9, 0); |
| 549 | jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1); |
| 550 | addl(edx, 64); |
| 551 | movq(r9, r10); |
| 552 | movq(r10, r8); |
| 553 | cmpq(r9, 0); |
| 554 | jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1); |
| 555 | xorpd(xmm0, xmm0); |
| 556 | xorpd(xmm6, xmm6); |
| 557 | jmp(L_2TAG_PACKET_11_0_1); |
| 558 | |
// Reached with the flags of "29 - bsr" from L_2TAG_PACKET_8_0_1: equal means
// no shift is needed; otherwise the highest bit is above position 29 and
// the words are shifted right by the difference.
| 559 | bind(L_2TAG_PACKET_9_0_1); |
| 560 | jcc(Assembler::equal, L_2TAG_PACKET_10_0_1); |
| 561 | negl(ecx); |
| 562 | shrq(r10); |
| 563 | movq(rax, r9); |
| 564 | shrq(r9); |
| 565 | subl(edx, ecx); |
| 566 | negl(ecx); |
| 567 | addl(ecx, 64); |
| 568 | shlq(rax); |
| 569 | orq(r10, rax); |
| 570 | jmp(L_2TAG_PACKET_10_0_1); |
// Taken when the shift count computed above is less than 1.
| 571 | bind(L_2TAG_PACKET_3_0_1); |
| 572 | negl(ecx); |
| 573 | shlq(r9, 32); |
| 574 | orq(r9, r11); |
| 575 | shlq(r9); |
| 576 | movq(rdi, r9); |
| 577 | testl(r9, INT_MIN); |
| 578 | jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1); |
| 579 | shrl(r9); |
| 580 | movl(rbx, 0); |
| 581 | shrq(rdi, 3); |
| 582 | jmp(L_2TAG_PACKET_6_0_1); |
| 583 | |
// Tested bit (268435456 = 1 << 28) was set: replace the three-word value
// r9:r10:r8 with (rbx:0:0 - r9:r10:r8) via a sub/sbb/sbb chain, bump rdi,
// and set rbx = 32768, which is XOR-ed into the sign position of the scale
// factor in L_2TAG_PACKET_10_0_1.
| 584 | bind(L_2TAG_PACKET_4_0_1); |
| 585 | shrl(r9); |
| 586 | movl(rbx, 536870912); |
| 587 | shrl(rbx); |
| 588 | shlq(r9, 32); |
| 589 | orq(r9, r11); |
| 590 | shlq(rbx, 32); |
| 591 | addl(rdi, 536870912); |
| 592 | movl(rcx, 0); |
| 593 | movl(r11, 0); |
| 594 | subq(rcx, r8); |
| 595 | sbbq(r11, r10); |
| 596 | sbbq(rbx, r9); |
| 597 | movq(r8, rcx); |
| 598 | movq(r10, r11); |
| 599 | movq(r9, rbx); |
| 600 | movl(rbx, 32768); |
| 601 | jmp(L_2TAG_PACKET_5_0_1); |
| 602 | |
// Same subtraction as L_2TAG_PACKET_4_0_1 for the path that came through
// L_2TAG_PACKET_3_0_1.
| 603 | bind(L_2TAG_PACKET_12_0_1); |
| 604 | shrl(r9); |
| 605 | mov64(rbx, 0x100000000); |
| 606 | shrq(rbx); |
| 607 | movl(rcx, 0); |
| 608 | movl(r11, 0); |
| 609 | subq(rcx, r8); |
| 610 | sbbq(r11, r10); |
| 611 | sbbq(rbx, r9); |
| 612 | movq(r8, rcx); |
| 613 | movq(r10, r11); |
| 614 | movq(r9, rbx); |
| 615 | movl(rbx, 32768); |
| 616 | shrq(rdi, 3); |
| 617 | addl(rdi, 536870912); |
| 618 | jmp(L_2TAG_PACKET_6_0_1); |
| 619 | |
// Inf/NaN input: reload the original x from the stack and multiply it by
// the NEG_ZERO constant (-0.0), so Inf becomes NaN and NaN stays NaN. The
// product stays in xmm0 and is also stored to [rsp + 0].
| 620 | bind(L_2TAG_PACKET_2_0_1); |
| 621 | movsd(xmm0, Address(rsp, 8)); |
| 622 | mulsd(xmm0, ExternalAddress(NEG_ZERO)); //0x00000000UL, 0x80000000UL |
| 623 | movq(Address(rsp, 0), xmm0); |
| 624 | |
| 625 | bind(L_2TAG_PACKET_13_0_1); |
| 626 | |
// Common epilogue: release the 16-byte scratch area and restore rbx.
| 627 | bind(B1_4); |
| 628 | addq(rsp, 16); |
| 629 | pop(rbx); |
| 630 | } |
| 631 | #else |
| 632 | // The 32 bit code is at most SSE2 compliant |
| 633 | |
// Constant pool for the 32-bit fast_cos below, addressed by byte offset from
// the table base. Every double is stored as two 32-bit words, low word first.
//
//   0 .. 2047  64 lookup entries of 32 bytes (four doubles) each. fast_cos
//              reads an entry at offsets +0, +8, +16 and +24; in the first
//              entry only the +24 double is non-zero (1.0).
//              NOTE(review): in the header's terms these look like C_hl,
//              S_hi, S_lo and sigma respectively - confirm.
//   2048       0xbfc5555555555555, 0xbfe0000000000000  (added after 2064 term)
//   2064       0x3f81111111111111, 0x3fa5555555555555
//   2080       0xbf2a01a01a01a01a, 0xbf56c16c16c16c17
//   2096       0x3ec71de3a556c734, 0x3efa01a01a01a01a
//   2112       0x3d90b4611a600000 (twice)
//   2128       0x3fb921fb54400000
//   2144       0x3b63198a2e037073
//   2160       0x40245f306dc9c883
//   2176       0x4338000000000000  (not referenced by fast_cos in this file)
//   2192       1.0
//   2208       -0.0 (multiplier for the Inf/NaN case)
//   2224       sign-bit mask
//   2240       0.5 (twice)
//
// The values at 2048..2160 are the same bit patterns the LP64 code in this
// file annotates on its SC_1, SC_2, SC_3, SC_4, P_2, P_1, P_3 and PI32INV
// loads.
| 634 | ATTRIBUTE_ALIGNED(16) juint _static_const_table_cos[] = |
| 635 | { |
| 636 | 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, |
| 637 | 0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL, |
| 638 | 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL, |
| 639 | 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, |
| 640 | 0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, |
| 641 | 0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, |
| 642 | 0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, |
| 643 | 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL, |
| 644 | 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL, |
| 645 | 0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL, |
| 646 | 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL, |
| 647 | 0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, |
| 648 | 0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL, |
| 649 | 0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, |
| 650 | 0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, |
| 651 | 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL, |
| 652 | 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL, |
| 653 | 0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL, |
| 654 | 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL, |
| 655 | 0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL, |
| 656 | 0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL, |
| 657 | 0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, |
| 658 | 0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, |
| 659 | 0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL, |
| 660 | 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL, |
| 661 | 0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL, |
| 662 | 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, |
| 663 | 0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, |
| 664 | 0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL, |
| 665 | 0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL, |
| 666 | 0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, |
| 667 | 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL, |
| 668 | 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL, |
| 669 | 0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL, |
| 670 | 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL, |
| 671 | 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, |
| 672 | 0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, |
| 673 | 0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, |
| 674 | 0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, |
| 675 | 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL, |
| 676 | 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL, |
| 677 | 0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL, |
| 678 | 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL, |
| 679 | 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, |
| 680 | 0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, |
| 681 | 0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, |
| 682 | 0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, |
| 683 | 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL, |
| 684 | 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL, |
| 685 | 0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL, |
| 686 | 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL, |
| 687 | 0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, |
| 688 | 0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, |
| 689 | 0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL, |
| 690 | 0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, |
| 691 | 0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL, |
| 692 | 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, |
| 693 | 0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL, |
| 694 | 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL, |
| 695 | 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL, |
| 696 | 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, |
| 697 | 0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL, |
| 698 | 0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, |
| 699 | 0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL, |
| 700 | 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, |
| 701 | 0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL, |
| 702 | 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL, |
| 703 | 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL, |
| 704 | 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, |
| 705 | 0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL, |
| 706 | 0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, |
| 707 | 0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL, |
| 708 | 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, |
| 709 | 0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL, |
| 710 | 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL, |
| 711 | 0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL, |
| 712 | 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL, |
| 713 | 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL, |
| 714 | 0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, |
| 715 | 0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL, |
| 716 | 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, |
| 717 | 0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL, |
| 718 | 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL, |
| 719 | 0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL, |
| 720 | 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, |
| 721 | 0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL, |
| 722 | 0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, |
| 723 | 0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL, |
| 724 | 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, |
| 725 | 0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL, |
| 726 | 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL, |
| 727 | 0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL, |
| 728 | 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, |
| 729 | 0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL, |
| 730 | 0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, |
| 731 | 0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL, |
| 732 | 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, |
| 733 | 0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL, |
| 734 | 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL, |
| 735 | 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL, |
| 736 | 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, |
| 737 | 0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL, |
| 738 | 0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL, |
| 739 | 0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL, |
| 740 | 0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL, |
| 741 | 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL, |
| 742 | 0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL, |
| 743 | 0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL, |
| 744 | 0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, |
| 745 | 0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL, |
| 746 | 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, |
| 747 | 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL, |
| 748 | 0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL |
| 749 | }; |
| 750 | //registers, |
| 751 | // input: (rbp + 8) |
| 752 | // scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 |
| 753 | // rax, rdx, rcx, rbx (tmp) |
| 754 | |
| 755 | // Code generated by Intel C compiler for LIBM library |
| 756 | |
// Emits the 32-bit instruction sequence that computes cos(x).
//
// The argument is read from the stack ([rsp + 128] after this function has
// lowered rsp by 120) and the result is pushed onto the x87 stack with fld_d.
// xmm0-xmm7, eax, ecx and edx are used as scratch. `tmp` holds the base
// address of _static_const_table_cos for the duration of the sequence; its
// previous value is saved at [rsp + 56] and reloaded at the end.
// NOTE(review): no matching addl(rsp, 120) is emitted here; presumably the
// caller's frame teardown restores rsp - confirm at the call site.
| 757 | void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) { |
| 758 | Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2; |
| 759 | Label start; |
| 760 | |
// The general-purpose register arguments must all be distinct.
| 761 | assert_different_registers(tmp, eax, ecx, edx); |
| 762 | |
| 763 | address static_const_table_cos = (address)_static_const_table_cos; |
| 764 | |
// Prologue: reserve 120 bytes, save tmp, point tmp at the constant table
// and load x.
| 765 | bind(start); |
| 766 | subl(rsp, 120); |
| 767 | movl(Address(rsp, 56), tmp); |
| 768 | lea(tmp, ExternalAddress(static_const_table_cos)); |
| 769 | movsd(xmm0, Address(rsp, 128)); |
// Range classification on the top 16 bits of x with the sign cleared
// (mask 32767 = 0x7FFF, bias 12336 = 0x3030, bound 4293 = 0x10C5). One
// unsigned compare sends every input outside that window to
// L_2TAG_PACKET_0_0_2.
| 770 | pextrw(eax, xmm0, 3); |
| 771 | andl(eax, 32767); |
| 772 | subl(eax, 12336); |
| 773 | cmpl(eax, 4293); |
| 774 | jcc(Assembler::above, L_2TAG_PACKET_0_0_2); |
// Main path. N = trunc(x * table[2160] + copysign(0.5, x)), where the 0.5
// pair comes from offset 2240 and the sign mask from offset 2224.
| 775 | movsd(xmm1, Address(tmp, 2160)); |
| 776 | mulsd(xmm1, xmm0); |
| 777 | movdqu(xmm5, Address(tmp, 2240)); |
| 778 | movsd(xmm4, Address(tmp, 2224)); |
| 779 | pand(xmm4, xmm0); |
| 780 | por(xmm5, xmm4); |
| 781 | movsd(xmm3, Address(tmp, 2128)); |
| 782 | movdqu(xmm2, Address(tmp, 2112)); |
| 783 | addpd(xmm1, xmm5); |
| 784 | cvttsd2sil(edx, xmm1); |
| 785 | cvtsi2sdl(xmm1, edx); |
| 786 | mulsd(xmm3, xmm1); |
| 787 | unpcklpd(xmm1, xmm1); |
// 1865232 = 64 * 29144 + 16, so after the "& 63" below the table index is
// (N + 16) mod 64.
// NOTE(review): 16 steps of pi/32 is pi/2, which is presumably how the sine
// formulation in the file header yields cosine - confirm.
| 788 | addl(edx, 1865232); |
| 789 | movdqu(xmm4, xmm0); |
| 790 | andl(edx, 63); |
| 791 | movdqu(xmm5, Address(tmp, 2096)); |
// eax = address of the selected 32-byte lookup entry (table base +
// index * 32).
| 792 | lea(eax, Address(tmp, 0)); |
| 793 | shll(edx, 5); |
| 794 | addl(eax, edx); |
| 795 | mulpd(xmm2, xmm1); |
| 796 | subsd(xmm0, xmm3); |
| 797 | mulsd(xmm1, Address(tmp, 2144)); |
| 798 | subsd(xmm4, xmm3); |
| 799 | movsd(xmm7, Address(eax, 8)); |
| 800 | unpcklpd(xmm0, xmm0); |
| 801 | movapd(xmm3, xmm4); |
| 802 | subsd(xmm4, xmm2); |
| 803 | mulpd(xmm5, xmm0); |
| 804 | subpd(xmm0, xmm2); |
| 805 | movdqu(xmm6, Address(tmp, 2064)); |
| 806 | mulsd(xmm7, xmm4); |
| 807 | subsd(xmm3, xmm4); |
| 808 | mulpd(xmm5, xmm0); |
| 809 | mulpd(xmm0, xmm0); |
| 810 | subsd(xmm3, xmm2); |
| 811 | movdqu(xmm2, Address(eax, 0)); |
| 812 | subsd(xmm1, xmm3); |
| 813 | movsd(xmm3, Address(eax, 24)); |
| 814 | addsd(xmm2, xmm3); |
| 815 | subsd(xmm7, xmm2); |
| 816 | mulsd(xmm2, xmm4); |
| 817 | mulpd(xmm6, xmm0); |
| 818 | mulsd(xmm3, xmm4); |
| 819 | mulpd(xmm2, xmm0); |
| 820 | mulpd(xmm0, xmm0); |
| 821 | addpd(xmm5, Address(tmp, 2080)); |
| 822 | mulsd(xmm4, Address(eax, 0)); |
| 823 | addpd(xmm6, Address(tmp, 2048)); |
| 824 | mulpd(xmm5, xmm0); |
| 825 | movapd(xmm0, xmm3); |
| 826 | addsd(xmm3, Address(eax, 8)); |
| 827 | mulpd(xmm1, xmm7); |
| 828 | movapd(xmm7, xmm4); |
| 829 | addsd(xmm4, xmm3); |
| 830 | addpd(xmm6, xmm5); |
| 831 | movsd(xmm5, Address(eax, 8)); |
| 832 | subsd(xmm5, xmm3); |
| 833 | subsd(xmm3, xmm4); |
| 834 | addsd(xmm1, Address(eax, 16)); |
| 835 | mulpd(xmm6, xmm2); |
// Final summation: small terms are accumulated in xmm1, then added to the
// high part in xmm4.
| 836 | addsd(xmm5, xmm0); |
| 837 | addsd(xmm3, xmm7); |
| 838 | addsd(xmm1, xmm5); |
| 839 | addsd(xmm1, xmm3); |
| 840 | addsd(xmm1, xmm6); |
| 841 | unpckhpd(xmm6, xmm6); |
| 842 | addsd(xmm1, xmm6); |
| 843 | addsd(xmm4, xmm1); |
// Move the SSE result through memory onto the x87 stack.
| 844 | movsd(Address(rsp, 0), xmm4); |
| 845 | fld_d(Address(rsp, 0)); |
| 846 | jmp(L_2TAG_PACKET_1_0_2); |
| 847 | |
// Out-of-window inputs. The flags are still those of the cmpl above: a
// signed "greater" result selects the large-argument code, otherwise the
// input is below the window and the result is 1 - |x| (1.0 is the constant
// at offset 2192).
| 848 | bind(L_2TAG_PACKET_0_0_2); |
| 849 | jcc(Assembler::greater, L_2TAG_PACKET_2_0_2); |
| 850 | pextrw(eax, xmm0, 3); |
| 851 | andl(eax, 32767); |
| 852 | pinsrw(xmm0, eax, 3); |
| 853 | movsd(xmm1, Address(tmp, 2192)); |
| 854 | subsd(xmm1, xmm0); |
| 855 | movsd(Address(rsp, 0), xmm1); |
| 856 | fld_d(Address(rsp, 0)); |
| 857 | jmp(L_2TAG_PACKET_1_0_2); |
| 858 | |
// Large arguments. 2146435072 = 0x7FF00000: if every exponent bit of the
// high dword of x is set, the input is Inf or NaN (L_2TAG_PACKET_3_0_2).
| 859 | bind(L_2TAG_PACKET_2_0_2); |
| 860 | movl(eax, Address(rsp, 132)); |
| 861 | andl(eax, 2146435072); |
| 862 | cmpl(eax, 2146435072); |
| 863 | jcc(Assembler::equal, L_2TAG_PACKET_3_0_2); |
// Otherwise call the StubRoutines::dlibm_sin_cos_huge() helper with a
// 32-byte outgoing area: x at [rsp + 0], a pointer to an 8-byte result slot
// at [rsp + 8] and the integer 1 at [rsp + 12]. The result slot is at
// [rsp + 8] once the outgoing area is released, which is where fld_d reads.
// NOTE(review): the constant 1 presumably selects cosine rather than sine -
// confirm against dlibm_sin_cos_huge.
| 864 | subl(rsp, 32); |
| 865 | movsd(Address(rsp, 0), xmm0); |
| 866 | lea(eax, Address(rsp, 40)); |
| 867 | movl(Address(rsp, 8), eax); |
| 868 | movl(eax, 1); |
| 869 | movl(Address(rsp, 12), eax); |
| 870 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge()))); |
| 871 | addl(rsp, 32); |
| 872 | fld_d(Address(rsp, 8)); |
| 873 | jmp(L_2TAG_PACKET_1_0_2); |
| 874 | |
// Inf/NaN input: multiply x by the constant at offset 2208 (-0.0) on the
// x87 stack, so Inf becomes NaN and NaN stays NaN.
| 875 | bind(L_2TAG_PACKET_3_0_2); |
| 876 | fld_d(Address(rsp, 128)); |
| 877 | fmul_d(Address(tmp, 2208)); |
| 878 | |
// Common exit: restore the caller's value of tmp.
| 879 | bind(L_2TAG_PACKET_1_0_2); |
| 880 | movl(tmp, Address(rsp, 56)); |
| 881 | } |
| 882 | #endif |
| 883 | |