1 | /* |
2 | * Copyright (c) 2016, Intel Corporation. |
3 | * Intel Math Library (LIBM) Source Code |
4 | * |
5 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
6 | * |
7 | * This code is free software; you can redistribute it and/or modify it |
8 | * under the terms of the GNU General Public License version 2 only, as |
9 | * published by the Free Software Foundation. |
10 | * |
11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
14 | * version 2 for more details (a copy is included in the LICENSE file that |
15 | * accompanied this code). |
16 | * |
17 | * You should have received a copy of the GNU General Public License version |
18 | * 2 along with this work; if not, write to the Free Software Foundation, |
19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
20 | * |
21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 | * or visit www.oracle.com if you need additional information or have any |
23 | * questions. |
24 | * |
25 | */ |
26 | |
27 | #include "precompiled.hpp" |
28 | #include "asm/assembler.hpp" |
29 | #include "asm/assembler.inline.hpp" |
30 | #include "macroAssembler_x86.hpp" |
31 | #include "runtime/stubRoutines.hpp" |
32 | #include "utilities/globalDefinitions.hpp" |
33 | |
34 | /******************************************************************************/ |
35 | // ALGORITHM DESCRIPTION - COS() |
36 | // --------------------- |
37 | // |
38 | // 1. RANGE REDUCTION |
39 | // |
40 | // We perform an initial range reduction from X to r with |
41 | // |
42 | // X =~= N * pi/32 + r |
43 | // |
44 | // so that |r| <= pi/64 + epsilon. We restrict inputs to those |
45 | // where |N| <= 932560. Beyond this, the range reduction is |
46 | // insufficiently accurate. For extremely small inputs, |
47 | // denormalization can occur internally, impacting performance. |
48 | // This means that the main path is actually only taken for |
49 | // 2^-252 <= |X| < 90112. |
50 | // |
51 | // To avoid branches, we perform the range reduction to full |
52 | // accuracy each time. |
53 | // |
54 | // X - N * (P_1 + P_2 + P_3) |
55 | // |
//     where P_1 and P_2 are 32-bit numbers (so multiplication by N
//     is exact) and P_3 is a 53-bit number. Together, these
//     approximate pi/32 well enough for all cases in the restricted
//     range.
60 | // |
61 | // The main reduction sequence is: |
62 | // |
63 | // y = 32/pi * x |
64 | // N = integer(y) |
65 | // (computed by adding and subtracting off SHIFTER) |
66 | // |
67 | // m_1 = N * P_1 |
68 | // m_2 = N * P_2 |
69 | // r_1 = x - m_1 |
70 | // r = r_1 - m_2 |
71 | // (this r can be used for most of the calculation) |
72 | // |
73 | // c_1 = r_1 - r |
74 | // m_3 = N * P_3 |
75 | // c_2 = c_1 - m_2 |
76 | // c = c_2 - m_3 |
77 | // |
78 | // 2. MAIN ALGORITHM |
79 | // |
80 | // The algorithm uses a table lookup based on B = M * pi / 32 |
81 | // where M = N mod 64. The stored values are: |
82 | // sigma closest power of 2 to cos(B) |
83 | // C_hl 53-bit cos(B) - sigma |
84 | // S_hi + S_lo 2 * 53-bit sin(B) |
85 | // |
86 | // The computation is organized as follows: |
87 | // |
88 | // sin(B + r + c) = [sin(B) + sigma * r] + |
89 | // r * (cos(B) - sigma) + |
90 | // sin(B) * [cos(r + c) - 1] + |
91 | // cos(B) * [sin(r + c) - r] |
92 | // |
93 | // which is approximately: |
94 | // |
95 | // [S_hi + sigma * r] + |
96 | // C_hl * r + |
97 | // S_lo + S_hi * [(cos(r) - 1) - r * c] + |
98 | // (C_hl + sigma) * [(sin(r) - r) + c] |
99 | // |
100 | // and this is what is actually computed. We separate this sum |
101 | // into four parts: |
102 | // |
103 | // hi + med + pols + corr |
104 | // |
105 | // where |
106 | // |
107 | // hi = S_hi + sigma r |
108 | // med = C_hl * r |
109 | // pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r) |
110 | // corr = S_lo + c * ((C_hl + sigma) - S_hi * r) |
111 | // |
112 | // 3. POLYNOMIAL |
113 | // |
114 | // The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) * |
115 | // (sin(r) - r) can be rearranged freely, since it is quite |
116 | // small, so we exploit parallelism to the fullest. |
117 | // |
118 | // psc4 = SC_4 * r_1 |
119 | // msc4 = psc4 * r |
120 | // r2 = r * r |
121 | // msc2 = SC_2 * r2 |
122 | // r4 = r2 * r2 |
123 | // psc3 = SC_3 + msc4 |
124 | // psc1 = SC_1 + msc2 |
125 | // msc3 = r4 * psc3 |
126 | // sincospols = psc1 + msc3 |
127 | // pols = sincospols * |
128 | // <S_hi * r^2 | (C_hl + sigma) * r^3> |
129 | // |
130 | // 4. CORRECTION TERM |
131 | // |
132 | // This is where the "c" component of the range reduction is |
133 | // taken into account; recall that just "r" is used for most of |
134 | // the calculation. |
135 | // |
136 | // -c = m_3 - c_2 |
137 | // -d = S_hi * r - (C_hl + sigma) |
138 | // corr = -c * -d + S_lo |
139 | // |
140 | // 5. COMPENSATED SUMMATIONS |
141 | // |
142 | // The two successive compensated summations add up the high |
143 | // and medium parts, leaving just the low parts to add up at |
144 | // the end. |
145 | // |
146 | // rs = sigma * r |
147 | // res_int = S_hi + rs |
148 | // k_0 = S_hi - res_int |
149 | // k_2 = k_0 + rs |
150 | // med = C_hl * r |
151 | // res_hi = res_int + med |
152 | // k_1 = res_int - res_hi |
153 | // k_3 = k_1 + med |
154 | // |
155 | // 6. FINAL SUMMATION |
156 | // |
157 | // We now add up all the small parts: |
158 | // |
//     res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
160 | // |
161 | // Now the overall result is just: |
162 | // |
163 | // res_hi + res_lo |
164 | // |
165 | // 7. SMALL ARGUMENTS |
166 | // |
167 | // Inputs with |X| < 2^-252 are treated specially as |
168 | // 1 - |x|. |
169 | // |
170 | // Special cases: |
171 | // cos(NaN) = quiet NaN, and raise invalid exception |
172 | // cos(INF) = NaN and raise invalid exception |
173 | // cos(0) = 1 |
174 | // |
175 | /******************************************************************************/ |
176 | |
177 | #ifdef _LP64 |
178 | // The 64 bit code is at most SSE2 compliant |
179 | ATTRIBUTE_ALIGNED(8) juint _ONE[] = |
180 | { |
181 | 0x00000000UL, 0x3ff00000UL |
182 | }; |
// Emits the x86_64 (SSE2) instruction sequence that computes cos(x) for a
// double-precision argument.
//
//   input : x in the register passed as xmm0 (spilled to [rsp + 8] right
//           after the prologue)
//   output: the result is left in xmm0 on every exit path
//
// The generated code has four paths, selected from the exponent of x:
//   * main path      - exponent window check passes: table-driven evaluation
//                      as described in the algorithm comment at the top of
//                      this file
//   * tiny |x|       - result is 1 - |x|
//   * huge |x|       - integer multi-word range reduction using PI_INV_TABLE,
//                      followed by the same table-driven evaluation
//   * Inf / NaN      - result is x * (-0.0)
//
// xmm1-xmm7 and the integer register arguments are scratch. rbx is saved and
// restored here. rsi and rdi are written on the huge-argument path without
// being saved in this function.
// NOTE(review): presumably the caller preserves rsi/rdi where the ABI needs
// it - confirm at the call site.
// NOTE(review): the body mixes the eax/ecx/edx parameters with the fixed
// registers rax/rcx/rdx (e.g. cvttsd2sil(edx, ...) followed by
// addq(rdx, ...)), so it relies on the caller passing rax, rcx and rdx for
// them; this is not asserted here.
void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) {

  Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
  Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
  Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
  Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4, start;

  assert_different_registers(r8, r9, r10, r11, eax, ecx, edx);

  // Addresses of the constant tables used below. All but ONE come from
  // StubRoutines::x86; ONE is the file-local _ONE array.
  address ONEHALF = StubRoutines::x86::_ONEHALF_addr();
  address P_2 = StubRoutines::x86::_P_2_addr();
  address SC_4 = StubRoutines::x86::_SC_4_addr();
  address Ctable = StubRoutines::x86::_Ctable_addr();
  address SC_2 = StubRoutines::x86::_SC_2_addr();
  address SC_3 = StubRoutines::x86::_SC_3_addr();
  address SC_1 = StubRoutines::x86::_SC_1_addr();
  address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr();
  address PI_4 = (address)StubRoutines::x86::_PI_4_addr();
  address PI32INV = (address)StubRoutines::x86::_PI32INV_addr();
  address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr();
  address P_1 = (address)StubRoutines::x86::_P_1_addr();
  address P_3 = (address)StubRoutines::x86::_P_3_addr();
  address ONE = (address)_ONE;
  address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr();

  // Prologue: save rbx, reserve 16 bytes of stack and spill x to [rsp + 8].
  bind(start);
  push(rbx);
  subq(rsp, 16);
  movsd(Address(rsp, 8), xmm0);

  // Exponent window check on the high 32 bits of x:
  //   2147418112 = 0x7FFF0000 keeps the top 15 non-sign bits,
  //   808452096  = 0x30300000 is the lower bound,
  //   281346048  = 0x10C50000 is the width of the accepted window.
  // The unsigned "above" branch leaves the main path for both too-small
  // (the subtraction wrapped) and too-large arguments.
  bind(B1_2);
  movl(eax, Address(rsp, 12));
  movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
  andl(eax, 2147418112);
  subl(eax, 808452096);
  cmpl(eax, 281346048);
  jcc(Assembler::above, L_2TAG_PACKET_0_0_1);

  // ---- Main path --------------------------------------------------------
  // y = (32/pi) * x, then add 0.5 carrying the sign of x so that the
  // truncating conversion below rounds to nearest: N = integer(y).
  mulsd(xmm1, xmm0);
  movdqu(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
  movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
  pand(xmm4, xmm0);
  por(xmm5, xmm4);
  addpd(xmm1, xmm5);
  cvttsd2sil(edx, xmm1);
  cvtsi2sdl(xmm1, edx);
  movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
  movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
  // m_1 = N * P_1
  mulsd(xmm3, xmm1);
  unpcklpd(xmm1, xmm1);
  // Table index = (N + 1865232) & 63. 1865232 is congruent to 16 modulo 64,
  // i.e. the index is shifted by a quarter of the 64-entry table.
  // NOTE(review): presumably this is the pi/2 phase shift that turns the
  // sin(B + r + c) evaluation described in the header into cos - confirm.
  addq(rdx, 1865232);
  movdqu(xmm4, xmm0);
  andq(rdx, 63);
  movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
  // rax = address of the selected 32-byte Ctable entry. As used below the
  // entry holds doubles at +0 (C_hl), +8 (S_hi), +16 (S_lo), +24 (sigma).
  lea(rax, ExternalAddress(Ctable));
  shlq(rdx, 5);
  addq(rax, rdx);
  // m_2 = N * P_2 (both lanes), r_1 = x - m_1, m_3 = N * P_3
  mulpd(xmm2, xmm1);
  subsd(xmm0, xmm3);
  mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
  subsd(xmm4, xmm3);
  movq(xmm7, Address(rax, 8));
  unpcklpd(xmm0, xmm0);
  movdqu(xmm3, xmm4);
  // r = r_1 - m_2
  subsd(xmm4, xmm2);
  mulpd(xmm5, xmm0);
  subpd(xmm0, xmm2);
  movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
  // xmm7 = S_hi * r (used for the correction term)
  mulsd(xmm7, xmm4);
  // c_1 = r_1 - r, c_2 = c_1 - m_2, then xmm1 = m_3 - c_2 (= -c)
  subsd(xmm3, xmm4);
  mulpd(xmm5, xmm0);
  mulpd(xmm0, xmm0);
  subsd(xmm3, xmm2);
  movdqu(xmm2, Address(rax, 0));
  subsd(xmm1, xmm3);
  movq(xmm3, Address(rax, 24));
  // xmm2 (low lane) = C_hl + sigma; xmm7 = S_hi * r - (C_hl + sigma) (= -d)
  addsd(xmm2, xmm3);
  subsd(xmm7, xmm2);
  mulsd(xmm2, xmm4);
  mulpd(xmm6, xmm0);
  // rs = sigma * r
  mulsd(xmm3, xmm4);
  mulpd(xmm2, xmm0);
  mulpd(xmm0, xmm0);
  addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
  // med = C_hl * r
  mulsd(xmm4, Address(rax, 0));
  addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
  mulpd(xmm5, xmm0);
  movdqu(xmm0, xmm3);
  // res_int = S_hi + rs
  addsd(xmm3, Address(rax, 8));
  // (-c) * (-d)
  mulpd(xmm1, xmm7);
  movdqu(xmm7, xmm4);
  // res_hi = res_int + med
  addsd(xmm4, xmm3);
  addpd(xmm6, xmm5);
  movq(xmm5, Address(rax, 8));
  // k_0 = S_hi - res_int, k_1 = res_int - res_hi
  subsd(xmm5, xmm3);
  subsd(xmm3, xmm4);
  // corr = (-c) * (-d) + S_lo
  addsd(xmm1, Address(rax, 16));
  // pols = sincospols * <S_hi * r^2 | (C_hl + sigma) * r^3>
  mulpd(xmm6, xmm2);
  // k_2 = rs + k_0, k_3 = k_1 + med
  addsd(xmm0, xmm5);
  addsd(xmm3, xmm7);
  // Final summation: low-order parts first, res_hi (xmm4) last.
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm3);
  addsd(xmm0, xmm6);
  unpckhpd(xmm6, xmm6);
  addsd(xmm0, xmm6);
  addsd(xmm0, xmm4);
  jmp(B1_4);

  // ---- Outside the main window ------------------------------------------
  // The flags are still those of the cmpl above: signed "greater" means the
  // exponent is above the window (huge / Inf / NaN); otherwise |x| is tiny.
  bind(L_2TAG_PACKET_0_0_1);
  jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
  // Tiny |x|: clear the sign bit (32767 = 0x7FFF on the top 16 bits) and
  // return 1 - |x|.
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  pinsrw(xmm0, eax, 3);
  movq(xmm1, ExternalAddress(ONE)); //0x00000000UL, 0x3ff00000UL
  subsd(xmm1, xmm0);
  movdqu(xmm0, xmm1);
  jmp(B1_4);

  // ---- Huge |x| ---------------------------------------------------------
  bind(L_2TAG_PACKET_1_0_1);
  // 32752 = 0x7FF0: an all-ones exponent field means Inf or NaN.
  pextrw(eax, xmm0, 3);
  andl(eax, 32752);
  cmpl(eax, 32752);
  jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
  // rcx = PI_INV_TABLE + byte offset derived from the exponent field; the
  // & 65532 mask makes the offset a multiple of 4.
  // NOTE(review): PI_INV_TABLE presumably holds the bits of 1/pi (or 2/pi)
  // as 32-bit words - confirm against its definition.
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  subl(ecx, 16224);
  shrl(ecx, 7);
  andl(ecx, 65532);
  lea(r11, ExternalAddress(PI_INV_TABLE));
  addq(rcx, r11);
  // Split the significand of x: edx = low 32 mantissa bits, eax = implicit
  // leading one (bit 20) plus the top 20 mantissa bits.
  movdq(rax, xmm0);
  movl(r10, Address(rcx, 20));
  movl(r8, Address(rcx, 24));
  movl(edx, eax);
  shrq(rax, 21);
  orl(eax, INT_MIN);
  shrl(eax, 11);
  // Multiply the split significand by the 32-bit table words at offsets
  // 24, 20, 16, 12, 8, 4 and 0, carrying in 32-bit pieces. The product is
  // collected in three 64-bit words: r8 (lowest), r10, r9 (highest).
  movl(r9, r10);
  imulq(r10, rdx);
  imulq(r9, rax);
  imulq(r8, rax);
  movl(rsi, Address(rcx, 16));
  movl(rdi, Address(rcx, 12));
  movl(r11, r10);
  shrq(r10, 32);
  addq(r9, r10);
  addq(r11, r8);
  movl(r8, r11);
  shrq(r11, 32);
  addq(r9, r11);
  movl(r10, rsi);
  imulq(rsi, rdx);
  imulq(r10, rax);
  movl(r11, rdi);
  imulq(rdi, rdx);
  movl(rbx, rsi);
  shrq(rsi, 32);
  addq(r9, rbx);
  movl(rbx, r9);
  shrq(r9, 32);
  addq(r10, rsi);
  addq(r10, r9);
  shlq(rbx, 32);
  orq(r8, rbx);
  imulq(r11, rax);
  movl(r9, Address(rcx, 8));
  movl(rsi, Address(rcx, 4));
  movl(rbx, rdi);
  shrq(rdi, 32);
  addq(r10, rbx);
  movl(rbx, r10);
  shrq(r10, 32);
  addq(r11, rdi);
  addq(r11, r10);
  movq(rdi, r9);
  imulq(r9, rdx);
  imulq(rdi, rax);
  movl(r10, r9);
  shrq(r9, 32);
  addq(r11, r10);
  movl(r10, r11);
  shrq(r11, 32);
  addq(rdi, r9);
  addq(rdi, r11);
  movq(r9, rsi);
  imulq(rsi, rdx);
  imulq(r9, rax);
  shlq(r10, 32);
  orq(r10, rbx);
  movl(eax, Address(rcx, 0));
  movl(r11, rsi);
  shrq(rsi, 32);
  addq(rdi, r11);
  movl(r11, rdi);
  shrq(rdi, 32);
  addq(r9, rsi);
  addq(r9, rdi);
  imulq(rdx, rax);
  // ecx = 8 * (table byte offset) + 19 - (unbiased exponent of x);
  // rsi = sign bit of x in bit 15 (32768 = 0x8000); edx = ecx + 32.
  pextrw(rbx, xmm0, 3);
  lea(rdi, ExternalAddress(PI_INV_TABLE));
  subq(rcx, rdi);
  addl(ecx, ecx);
  addl(ecx, ecx);
  addl(ecx, ecx);
  addl(ecx, 19);
  movl(rsi, 32768);
  andl(rsi, rbx);
  shrl(rbx, 4);
  andl(rbx, 2047);
  subl(rbx, 1023);
  subl(ecx, rbx);
  addq(r9, rdx);
  movl(edx, ecx);
  addl(edx, 32);
  cmpl(ecx, 1);
  jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
  // Shift count >= 1: align the top word, keep a copy in rdi (its upper bits
  // are extracted with shrl(rdi, 29) further down) and keep the low 29 bits
  // (536870911 = 0x1FFFFFFF) as the fraction. Bit 28 (268435456) set means
  // the fraction is at least one half: handled at L_2TAG_PACKET_4_0_1.
  negl(ecx);
  addl(ecx, 29);
  shll(r9);
  movl(rdi, r9);
  andl(r9, 536870911);
  testl(r9, 268435456);
  jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
  shrl(r9);
  movl(rbx, 0);
  shlq(r9, 32);
  orq(r9, r11);

  bind(L_2TAG_PACKET_5_0_1);

  bind(L_2TAG_PACKET_6_0_1);
  // If the top word is zero, shift whole words up (L_2TAG_PACKET_7_0_1).
  cmpq(r9, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);

  // Normalize r9:r10:r8 so that the highest set bit of r9 is bit 29;
  // edx tracks the accumulated shift.
  bind(L_2TAG_PACKET_8_0_1);
  bsrq(r11, r9);
  movl(ecx, 29);
  subl(ecx, r11);
  jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
  shlq(r9);
  movq(rax, r10);
  shlq(r10);
  addl(edx, ecx);
  negl(ecx);
  addl(ecx, 64);
  shrq(rax);
  shrq(r8);
  orq(r9, rax);
  orq(r10, r8);

  // Convert the two top words to doubles, scale each by a power of two whose
  // exponent field is built from edx (16368 = 0x3FF0, 1008 = 0x3F0), apply
  // the sign bits from rsi/rbx, and multiply by the two parts of PI_4. The
  // reduced argument ends up in xmm0 with a low-order correction in xmm6.
  bind(L_2TAG_PACKET_10_0_1);
  cvtsi2sdq(xmm0, r9);
  shrq(r10, 1);
  cvtsi2sdq(xmm3, r10);
  xorpd(xmm4, xmm4);
  shll(edx, 4);
  negl(edx);
  addl(edx, 16368);
  orl(edx, rsi);
  xorl(edx, rbx);
  pinsrw(xmm4, edx, 3);
  movq(xmm2, ExternalAddress(PI_4)); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
  movq(xmm6, ExternalAddress(8 + PI_4)); //0x18469899UL, 0x3e64442dUL
  xorpd(xmm5, xmm5);
  subl(edx, 1008);
  pinsrw(xmm5, edx, 3);
  mulsd(xmm0, xmm4);
  // rsi becomes 0 or -1 depending on the sign of x; add + xor with it below
  // conditionally negates rdi >> 29, and the result is copied to eax.
  shll(rsi, 16);
  sarl(rsi, 31);
  mulsd(xmm3, xmm5);
  movdqu(xmm1, xmm0);
  mulsd(xmm0, xmm2);
  shrl(rdi, 29);
  addsd(xmm1, xmm3);
  mulsd(xmm3, xmm2);
  addl(rdi, rsi);
  xorl(rdi, rsi);
  mulsd(xmm6, xmm1);
  movl(eax, rdi);
  addsd(xmm6, xmm3);
  movdqu(xmm2, xmm0);
  addsd(xmm0, xmm6);
  subsd(xmm2, xmm0);
  addsd(xmm6, xmm2);

  // Same evaluation as the main path, applied to the reduced argument in
  // xmm0. Differences: the conversion to N is 64-bit, the table index is
  // additionally offset by eax * 8, and the low-order part xmm6 is
  // subtracted in the correction term.
  bind(L_2TAG_PACKET_11_0_1);
  movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
  mulsd(xmm1, xmm0);
  movq(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
  movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
  pand(xmm4, xmm0);
  por(xmm5, xmm4);
  addpd(xmm1, xmm5);
  cvttsd2siq(rdx, xmm1);
  cvtsi2sdq(xmm1, rdx);
  movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
  movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
  mulsd(xmm3, xmm1);
  unpcklpd(xmm1, xmm1);
  shll(eax, 3);
  addl(edx, 1865232);
  movdqu(xmm4, xmm0);
  addl(edx, eax);
  andl(edx, 63);
  movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
  lea(rax, ExternalAddress(Ctable));
  shll(edx, 5);
  addq(rax, rdx);
  mulpd(xmm2, xmm1);
  subsd(xmm0, xmm3);
  mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
  subsd(xmm4, xmm3);
  movq(xmm7, Address(rax, 8));
  unpcklpd(xmm0, xmm0);
  movdqu(xmm3, xmm4);
  subsd(xmm4, xmm2);
  mulpd(xmm5, xmm0);
  subpd(xmm0, xmm2);
  mulsd(xmm7, xmm4);
  subsd(xmm3, xmm4);
  mulpd(xmm5, xmm0);
  mulpd(xmm0, xmm0);
  subsd(xmm3, xmm2);
  movdqu(xmm2, Address(rax, 0));
  subsd(xmm1, xmm3);
  movq(xmm3, Address(rax, 24));
  addsd(xmm2, xmm3);
  subsd(xmm7, xmm2);
  // Fold the low-order part of the reduced argument into -c.
  subsd(xmm1, xmm6);
  movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
  mulsd(xmm2, xmm4);
  mulpd(xmm6, xmm0);
  mulsd(xmm3, xmm4);
  mulpd(xmm2, xmm0);
  mulpd(xmm0, xmm0);
  addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
  mulsd(xmm4, Address(rax, 0));
  addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
  mulpd(xmm5, xmm0);
  movdqu(xmm0, xmm3);
  addsd(xmm3, Address(rax, 8));
  mulpd(xmm1, xmm7);
  movdqu(xmm7, xmm4);
  addsd(xmm4, xmm3);
  addpd(xmm6, xmm5);
  movq(xmm5, Address(rax, 8));
  subsd(xmm5, xmm3);
  subsd(xmm3, xmm4);
  addsd(xmm1, Address(rax, 16));
  mulpd(xmm6, xmm2);
  addsd(xmm5, xmm0);
  addsd(xmm3, xmm7);
  addsd(xmm1, xmm5);
  addsd(xmm1, xmm3);
  addsd(xmm1, xmm6);
  unpckhpd(xmm6, xmm6);
  movdqu(xmm0, xmm4);
  addsd(xmm1, xmm6);
  addsd(xmm0, xmm1);
  jmp(B1_4);

  // Top word is zero: move the words up by 64 bits (twice if needed) and
  // account for it in edx. If everything is zero, continue with a zero
  // reduced argument.
  bind(L_2TAG_PACKET_7_0_1);
  addl(edx, 64);
  movq(r9, r10);
  movq(r10, r8);
  movl(r8, 0);
  cmpq(r9, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
  addl(edx, 64);
  movq(r9, r10);
  movq(r10, r8);
  cmpq(r9, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
  xorpd(xmm0, xmm0);
  xorpd(xmm6, xmm6);
  jmp(L_2TAG_PACKET_11_0_1);

  // Highest set bit of r9 is at or above bit 29: nothing to do if it is
  // exactly bit 29 (flags still from the subl), otherwise shift right.
  bind(L_2TAG_PACKET_9_0_1);
  jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
  negl(ecx);
  shrq(r10);
  movq(rax, r9);
  shrq(r9);
  subl(edx, ecx);
  negl(ecx);
  addl(ecx, 64);
  shlq(rax);
  orq(r10, rax);
  jmp(L_2TAG_PACKET_10_0_1);
  // Shift count < 1: combine r9 and r11 into one 64-bit word before
  // shifting, then test bit 31 for "fraction at least one half".
  bind(L_2TAG_PACKET_3_0_1);
  negl(ecx);
  shlq(r9, 32);
  orq(r9, r11);
  shlq(r9);
  movq(rdi, r9);
  testl(r9, INT_MIN);
  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
  shrl(r9);
  movl(rbx, 0);
  shrq(rdi, 3);
  jmp(L_2TAG_PACKET_6_0_1);

  // Fraction at least one half (shift count >= 1 case): replace the
  // three-word value r9:r10:r8 by (constant in rbx) - value via a
  // sub/sbb/sbb borrow chain, bump the integer part in rdi
  // (536870912 = 0x20000000) and record the sign flip in rbx (32768).
  bind(L_2TAG_PACKET_4_0_1);
  shrl(r9);
  movl(rbx, 536870912);
  shrl(rbx);
  shlq(r9, 32);
  orq(r9, r11);
  shlq(rbx, 32);
  addl(rdi, 536870912);
  movl(rcx, 0);
  movl(r11, 0);
  subq(rcx, r8);
  sbbq(r11, r10);
  sbbq(rbx, r9);
  movq(r8, rcx);
  movq(r10, r11);
  movq(r9, rbx);
  movl(rbx, 32768);
  jmp(L_2TAG_PACKET_5_0_1);

  // Same as L_2TAG_PACKET_4_0_1 for the shift count < 1 case.
  bind(L_2TAG_PACKET_12_0_1);
  shrl(r9);
  mov64(rbx, 0x100000000);
  shrq(rbx);
  movl(rcx, 0);
  movl(r11, 0);
  subq(rcx, r8);
  sbbq(r11, r10);
  sbbq(rbx, r9);
  movq(r8, rcx);
  movq(r10, r11);
  movq(r9, rbx);
  movl(rbx, 32768);
  shrq(rdi, 3);
  addl(rdi, 536870912);
  jmp(L_2TAG_PACKET_6_0_1);

  // ---- Inf / NaN --------------------------------------------------------
  // Reload the original x and multiply by -0.0; the product stays in xmm0
  // as the result. It is also stored to [rsp + 0], which is not read again
  // in this function.
  bind(L_2TAG_PACKET_2_0_1);
  movsd(xmm0, Address(rsp, 8));
  mulsd(xmm0, ExternalAddress(NEG_ZERO)); //0x00000000UL, 0x80000000UL
  movq(Address(rsp, 0), xmm0);

  bind(L_2TAG_PACKET_13_0_1);

  // Common epilogue: release the 16-byte scratch area and restore rbx.
  bind(B1_4);
  addq(rsp, 16);
  pop(rbx);
}
631 | #else |
632 | // The 32 bit code is at most SSE2 compliant |
633 | |
// Constant pool for the 32-bit fast_cos below. Every double is stored as two
// 32-bit words, low word first. Byte offsets as used by the code:
//
//      0 .. 2047  64 table entries of 32 bytes each, selected by
//                 ((N + 1865232) & 63) << 5. Each entry holds four doubles,
//                 at +0, +8, +16 and +24; following the algorithm
//                 description at the top of this file these are C_hl, S_hi,
//                 S_lo and sigma.
//   2048          polynomial coefficient pair added last (the 64-bit code
//                 loads the same values as SC_1)
//   2064          polynomial coefficient pair (SC_2 in the 64-bit code)
//   2080          polynomial coefficient pair (SC_3 in the 64-bit code)
//   2096          polynomial coefficient pair (SC_4 in the 64-bit code)
//   2112          P_2, duplicated in both lanes
//   2128          P_1
//   2144          P_3
//   2160          32/pi, the multiplier that produces y = 32/pi * x
//   2176          0x43380000_00000000; not referenced by the code in this
//                 file (NOTE(review): presumably the SHIFTER constant
//                 mentioned in the algorithm description - confirm)
//   2192          1.0, used for the tiny-argument result 1 - |x|
//   2208          -0.0, multiplied with x on the Inf/NaN path
//   2224          sign-bit mask
//   2240          0.5 in both lanes
ATTRIBUTE_ALIGNED(16) juint _static_const_table_cos[] =
{
    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL,
    0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
    0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL,
    0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL,
    0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL,
    0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL,
    0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL,
    0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL,
    0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL,
    0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
    0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL,
    0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL,
    0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL,
    0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL,
    0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL,
    0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL,
    0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL,
    0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
    0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL,
    0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL,
    0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL,
    0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL,
    0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL,
    0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL,
    0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL,
    0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL,
    0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL,
    0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL,
    0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL,
    0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL,
    0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL,
    0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
    0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL,
    0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL,
    0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL,
    0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL,
    0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL,
    0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL,
    0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL,
    0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
    0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL,
    0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL,
    0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL,
    0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL,
    0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL,
    0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL,
    0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL,
    0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
    0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL,
    0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
    0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL,
    0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL,
    0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL,
    0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL,
    0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL,
    0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL,
    0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL,
    0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
    0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL,
    0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL,
    0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL,
    0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL,
    0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL,
    0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL,
    0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL,
    0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
    0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL,
    0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL,
    0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL,
    0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL,
    0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL,
    0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL,
    0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL,
    0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL,
    0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL,
    0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL,
    0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL,
    0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL,
    0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL,
    0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL,
    0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
    0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL,
    0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL,
    0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL,
    0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL,
    0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL,
    0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL,
    0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL,
    0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
    0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL,
    0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL,
    0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL,
    0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL,
    0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL,
    0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL,
    0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL,
    0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
    0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL,
    0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL,
    0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL,
    0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL,
    0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL,
    0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL,
    0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
    0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL,
    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL,
    0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL,
    0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
};
750 | //registers, |
751 | // input: (rbp + 8) |
752 | // scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 |
753 | // rax, rdx, rcx, rbx (tmp) |
754 | |
755 | // Code generated by Intel C compiler for LIBM library |
756 | |
// Emits the 32-bit x86 (SSE2 + x87) instruction sequence that computes
// cos(x) for a double-precision argument.
//
//   input : x is read from [rsp + 128] after 120 bytes have been reserved,
//           i.e. from 8 bytes above the stack pointer on entry
//   output: the result is pushed on the x87 stack with fld_d on every path
//
// All constants are addressed relative to tmp, which is pointed at
// _static_const_table_cos. The previous value of tmp is saved in
// [rsp + 56] and restored at the end.
//
// NOTE(review): the 120 bytes reserved with subl(rsp, 120) are not released
// in this function; presumably the caller restores rsp afterwards (e.g. with
// a frame-pointer based epilogue) - confirm at the call site.
void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label start;

  assert_different_registers(tmp, eax, ecx, edx);

  address static_const_table_cos = (address)_static_const_table_cos;

  // Prologue: reserve scratch space, save tmp, load the table base and x.
  bind(start);
  subl(rsp, 120);
  movl(Address(rsp, 56), tmp);
  lea(tmp, ExternalAddress(static_const_table_cos));
  movsd(xmm0, Address(rsp, 128));
  // Exponent window check on the top 16 bits of x with the sign removed:
  // 12336 = 0x3030 is the lower bound, 4293 = 0x10C5 the window width. The
  // unsigned "above" branch leaves the main path for both too-small and
  // too-large arguments.
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  subl(eax, 12336);
  cmpl(eax, 4293);
  jcc(Assembler::above, L_2TAG_PACKET_0_0_2);

  // ---- Main path --------------------------------------------------------
  // y = (32/pi) * x  (constant at table offset 2160)
  movsd(xmm1, Address(tmp, 2160));
  mulsd(xmm1, xmm0);
  // 0.5 (offset 2240) with the sign of x (mask at offset 2224), so that the
  // truncating conversion below rounds to nearest.
  movdqu(xmm5, Address(tmp, 2240));
  movsd(xmm4, Address(tmp, 2224));
  pand(xmm4, xmm0);
  por(xmm5, xmm4);
  // P_1 (offset 2128) and P_2 (offset 2112)
  movsd(xmm3, Address(tmp, 2128));
  movdqu(xmm2, Address(tmp, 2112));
  addpd(xmm1, xmm5);
  // N = integer(y), then back to double
  cvttsd2sil(edx, xmm1);
  cvtsi2sdl(xmm1, edx);
  // m_1 = N * P_1
  mulsd(xmm3, xmm1);
  unpcklpd(xmm1, xmm1);
  // Table index = (N + 1865232) & 63; 1865232 is congruent to 16 modulo 64,
  // i.e. the index is shifted by a quarter of the 64-entry table.
  addl(edx, 1865232);
  movdqu(xmm4, xmm0);
  andl(edx, 63);
  movdqu(xmm5, Address(tmp, 2096));
  // eax = address of the selected 32-byte table entry
  lea(eax, Address(tmp, 0));
  shll(edx, 5);
  addl(eax, edx);
  // m_2 = N * P_2, r_1 = x - m_1, m_3 = N * P_3 (offset 2144)
  mulpd(xmm2, xmm1);
  subsd(xmm0, xmm3);
  mulsd(xmm1, Address(tmp, 2144));
  subsd(xmm4, xmm3);
  movsd(xmm7, Address(eax, 8));
  unpcklpd(xmm0, xmm0);
  movapd(xmm3, xmm4);
  // r = r_1 - m_2
  subsd(xmm4, xmm2);
  mulpd(xmm5, xmm0);
  subpd(xmm0, xmm2);
  movdqu(xmm6, Address(tmp, 2064));
  // xmm7 = S_hi * r
  mulsd(xmm7, xmm4);
  // c_1 = r_1 - r, c_2 = c_1 - m_2, then xmm1 = m_3 - c_2 (= -c)
  subsd(xmm3, xmm4);
  mulpd(xmm5, xmm0);
  mulpd(xmm0, xmm0);
  subsd(xmm3, xmm2);
  movdqu(xmm2, Address(eax, 0));
  subsd(xmm1, xmm3);
  movsd(xmm3, Address(eax, 24));
  // xmm2 (low lane) = C_hl + sigma; xmm7 = S_hi * r - (C_hl + sigma) (= -d)
  addsd(xmm2, xmm3);
  subsd(xmm7, xmm2);
  mulsd(xmm2, xmm4);
  mulpd(xmm6, xmm0);
  // rs = sigma * r
  mulsd(xmm3, xmm4);
  mulpd(xmm2, xmm0);
  mulpd(xmm0, xmm0);
  addpd(xmm5, Address(tmp, 2080));
  // med = C_hl * r
  mulsd(xmm4, Address(eax, 0));
  addpd(xmm6, Address(tmp, 2048));
  mulpd(xmm5, xmm0);
  movapd(xmm0, xmm3);
  // res_int = S_hi + rs
  addsd(xmm3, Address(eax, 8));
  // (-c) * (-d)
  mulpd(xmm1, xmm7);
  movapd(xmm7, xmm4);
  // res_hi = res_int + med
  addsd(xmm4, xmm3);
  addpd(xmm6, xmm5);
  movsd(xmm5, Address(eax, 8));
  // k_0 = S_hi - res_int, k_1 = res_int - res_hi
  subsd(xmm5, xmm3);
  subsd(xmm3, xmm4);
  // corr = (-c) * (-d) + S_lo
  addsd(xmm1, Address(eax, 16));
  mulpd(xmm6, xmm2);
  // k_2 = k_0 + rs, k_3 = k_1 + med
  addsd(xmm5, xmm0);
  addsd(xmm3, xmm7);
  // Final summation: low-order parts in xmm1, then added to res_hi (xmm4).
  addsd(xmm1, xmm5);
  addsd(xmm1, xmm3);
  addsd(xmm1, xmm6);
  unpckhpd(xmm6, xmm6);
  addsd(xmm1, xmm6);
  addsd(xmm4, xmm1);
  // Move the SSE result to the x87 stack through memory.
  movsd(Address(rsp, 0), xmm4);
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_1_0_2);

  // ---- Outside the main window ------------------------------------------
  // The flags are still those of the cmpl above: signed "greater" means the
  // exponent is above the window; otherwise |x| is tiny.
  bind(L_2TAG_PACKET_0_0_2);
  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
  // Tiny |x|: clear the sign bit and return 1 - |x| (1.0 at offset 2192).
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  pinsrw(xmm0, eax, 3);
  movsd(xmm1, Address(tmp, 2192));
  subsd(xmm1, xmm0);
  movsd(Address(rsp, 0), xmm1);
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_1_0_2);

  // ---- Huge |x| ---------------------------------------------------------
  bind(L_2TAG_PACKET_2_0_2);
  // 2146435072 = 0x7FF00000: an all-ones exponent field means Inf or NaN.
  movl(eax, Address(rsp, 132));
  andl(eax, 2146435072);
  cmpl(eax, 2146435072);
  jcc(Assembler::equal, L_2TAG_PACKET_3_0_2);
  // Call the out-of-line helper with three stack arguments: x (8 bytes),
  // a pointer to an 8-byte slot in this frame ([rsp + 40] here, which is
  // [rsp + 8] once the 32 bytes are released), and the integer 1.
  // NOTE(review): presumably the helper stores its result through the
  // pointer and the 1 selects cos rather than sin - confirm against
  // StubRoutines::dlibm_sin_cos_huge.
  subl(rsp, 32);
  movsd(Address(rsp, 0), xmm0);
  lea(eax, Address(rsp, 40));
  movl(Address(rsp, 8), eax);
  movl(eax, 1);
  movl(Address(rsp, 12), eax);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge())));
  addl(rsp, 32);
  fld_d(Address(rsp, 8));
  jmp(L_2TAG_PACKET_1_0_2);

  // ---- Inf / NaN --------------------------------------------------------
  // Result is x * (-0.0), computed on the x87 stack (-0.0 at offset 2208).
  bind(L_2TAG_PACKET_3_0_2);
  fld_d(Address(rsp, 128));
  fmul_d(Address(tmp, 2208));

  // Common exit: restore the caller's value of tmp.
  bind(L_2TAG_PACKET_1_0_2);
  movl(tmp, Address(rsp, 56));
}
882 | #endif |
883 | |