1/*
2* Copyright (c) 2016, Intel Corporation.
3* Intel Math Library (LIBM) Source Code
4*
5* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6*
7* This code is free software; you can redistribute it and/or modify it
8* under the terms of the GNU General Public License version 2 only, as
9* published by the Free Software Foundation.
10*
11* This code is distributed in the hope that it will be useful, but WITHOUT
12* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14* version 2 for more details (a copy is included in the LICENSE file that
15* accompanied this code).
16*
17* You should have received a copy of the GNU General Public License version
18* 2 along with this work; if not, write to the Free Software Foundation,
19* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20*
21* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22* or visit www.oracle.com if you need additional information or have any
23* questions.
24*
25*/
26
27#include "precompiled.hpp"
28#include "asm/assembler.hpp"
29#include "asm/assembler.inline.hpp"
30#include "macroAssembler_x86.hpp"
31#include "runtime/stubRoutines.hpp"
32#include "utilities/globalDefinitions.hpp"
33
34/******************************************************************************/
35// ALGORITHM DESCRIPTION - COS()
36// ---------------------
37//
38// 1. RANGE REDUCTION
39//
40// We perform an initial range reduction from X to r with
41//
42// X =~= N * pi/32 + r
43//
44// so that |r| <= pi/64 + epsilon. We restrict inputs to those
45// where |N| <= 932560. Beyond this, the range reduction is
46// insufficiently accurate. For extremely small inputs,
47// denormalization can occur internally, impacting performance.
48// This means that the main path is actually only taken for
49// 2^-252 <= |X| < 90112.
50//
51// To avoid branches, we perform the range reduction to full
52// accuracy each time.
53//
54// X - N * (P_1 + P_2 + P_3)
55//
56// where P_1 and P_2 are 32-bit numbers (so multiplication by N
57// is exact) and P_3 is a 53-bit number. Together, these
58// approximate pi well enough for all cases in the restricted
59// range.
60//
61// The main reduction sequence is:
62//
63// y = 32/pi * x
64// N = integer(y)
65// (computed by adding and subtracting off SHIFTER)
66//
67// m_1 = N * P_1
68// m_2 = N * P_2
69// r_1 = x - m_1
70// r = r_1 - m_2
71// (this r can be used for most of the calculation)
72//
73// c_1 = r_1 - r
74// m_3 = N * P_3
75// c_2 = c_1 - m_2
76// c = c_2 - m_3
77//
78// 2. MAIN ALGORITHM
79//
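80// The algorithm uses a table lookup based on B = M * pi / 32
81// where M = N mod 64. For cos(), the code biases N by a constant
// that is congruent to 16 (mod 64) before masking with 63, which
// advances B by pi/2, so the sin(B + r + c) expansion below
// evaluates cos(X). The stored values are: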
82// sigma closest power of 2 to cos(B)
83// C_hl 53-bit cos(B) - sigma
84// S_hi + S_lo 2 * 53-bit sin(B)
85//
86// The computation is organized as follows:
87//
88// sin(B + r + c) = [sin(B) + sigma * r] +
89// r * (cos(B) - sigma) +
90// sin(B) * [cos(r + c) - 1] +
91// cos(B) * [sin(r + c) - r]
92//
93// which is approximately:
94//
95// [S_hi + sigma * r] +
96// C_hl * r +
97// S_lo + S_hi * [(cos(r) - 1) - r * c] +
98// (C_hl + sigma) * [(sin(r) - r) + c]
99//
100// and this is what is actually computed. We separate this sum
101// into four parts:
102//
103// hi + med + pols + corr
104//
105// where
106//
107// hi = S_hi + sigma r
108// med = C_hl * r
109// pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
110// corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
111//
112// 3. POLYNOMIAL
113//
114// The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
115// (sin(r) - r) can be rearranged freely, since it is quite
116// small, so we exploit parallelism to the fullest.
117//
118// psc4 = SC_4 * r_1
119// msc4 = psc4 * r
120// r2 = r * r
121// msc2 = SC_2 * r2
122// r4 = r2 * r2
123// psc3 = SC_3 + msc4
124// psc1 = SC_1 + msc2
125// msc3 = r4 * psc3
126// sincospols = psc1 + msc3
127// pols = sincospols *
128// <S_hi * r^2 | (C_hl + sigma) * r^3>
129//
130// 4. CORRECTION TERM
131//
132// This is where the "c" component of the range reduction is
133// taken into account; recall that just "r" is used for most of
134// the calculation.
135//
136// -c = m_3 - c_2
137// -d = S_hi * r - (C_hl + sigma)
138// corr = -c * -d + S_lo
139//
140// 5. COMPENSATED SUMMATIONS
141//
142// The two successive compensated summations add up the high
143// and medium parts, leaving just the low parts to add up at
144// the end.
145//
146// rs = sigma * r
147// res_int = S_hi + rs
148// k_0 = S_hi - res_int
149// k_2 = k_0 + rs
150// med = C_hl * r
151// res_hi = res_int + med
152// k_1 = res_int - res_hi
153// k_3 = k_1 + med
154//
155// 6. FINAL SUMMATION
156//
157// We now add up all the small parts:
158//
159// res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
160//
161// Now the overall result is just:
162//
163// res_hi + res_lo
164//
165// 7. SMALL ARGUMENTS
166//
167// Inputs with |X| < 2^-252 are treated specially as
168// 1 - |x|.
169//
170// Special cases:
171// cos(NaN) = quiet NaN, and raise invalid exception
172// cos(INF) = NaN and raise invalid exception
173// cos(0) = 1
174//
175/******************************************************************************/
176
177#ifdef _LP64
178// The 64 bit code is at most SSE2 compliant
// The double constant 1.0, stored as two 32-bit words (low word first).
// The 64-bit fast_cos() below takes its address as ONE and subtracts |x|
// from it on the tiny-argument path.
179ATTRIBUTE_ALIGNED(8) juint _ONE[] =
180{
181 0x00000000UL, 0x3ff00000UL
182};
// Emits the 64-bit (x86_64) instruction sequence that computes cos(x) for a
// double, following the algorithm described at the top of this file.
//
// Data flow visible in this function:
//   - The argument is taken from xmm0 (it is spilled to the stack at rsp + 8
//     right after the prologue).
//   - Every path leaves its result in xmm0 before reaching B1_4.
//   - rbx is pushed on entry and popped on exit; 16 bytes of stack are
//     reserved in between. No return instruction is emitted here.
//
// Parameters:
//   xmm0..xmm7        - XMM registers used for the argument/result and as
//                       scratch.
//   eax, ecx, edx     - general purpose scratch registers.
//   r8, r9, r10, r11  - general purpose scratch registers used by the
//                       large-argument reduction.
//
// NOTE(review): the body mixes the parameters eax/ecx/edx with the register
// names rax/rcx/rdx, and also uses rbx, rsi and rdi, none of which are
// parameters. It therefore appears to rely on the caller passing rax, rcx and
// rdx for eax, ecx and edx, and on rsi/rdi being free to clobber - confirm
// against the call site.
//
// NOTE(review): several shifts below are emitted with a register operand only
// (for example shll(r9)); presumably they shift by the count held in cl, which
// is why ecx is prepared just before them - confirm against the Assembler.
183void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) {
184
185 Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
186 Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
187 Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
188 Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4, start;
189
190 assert_different_registers(r8, r9, r10, r11, eax, ecx, edx);
191
    // Addresses of the constants used below. The trailing comments on the
    // loads list each constant's 32-bit words, low word first.
192 address ONEHALF = StubRoutines::x86::_ONEHALF_addr();
193 address P_2 = StubRoutines::x86::_P_2_addr();
194 address SC_4 = StubRoutines::x86::_SC_4_addr();
195 address Ctable = StubRoutines::x86::_Ctable_addr();
196 address SC_2 = StubRoutines::x86::_SC_2_addr();
197 address SC_3 = StubRoutines::x86::_SC_3_addr();
198 address SC_1 = StubRoutines::x86::_SC_1_addr();
199 address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr();
200 address PI_4 = (address)StubRoutines::x86::_PI_4_addr();
201 address PI32INV = (address)StubRoutines::x86::_PI32INV_addr();
202 address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr();
203 address P_1 = (address)StubRoutines::x86::_P_1_addr();
204 address P_3 = (address)StubRoutines::x86::_P_3_addr();
205 address ONE = (address)_ONE;
206 address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr();
207
    // Prologue: save rbx, reserve 16 bytes and spill the argument to rsp + 8
    // so that its upper 32 bits can be read back as an integer.
208 bind(start);
209 push(rbx);
210 subq(rsp, 16);
211 movsd(Address(rsp, 8), xmm0);
212
    // Range check on the upper 32 bits of x: mask with 0x7fff0000 (exponent
    // plus the top four mantissa bits), subtract 0x30300000 and compare
    // unsigned with 0x10c50000. "above" is taken both when the subtraction
    // wraps (masked word below 0x30300000) and when the masked word exceeds
    // 0x40f50000; per the file header the main path covers
    // 2^-252 up to (but excluding) 90112 in magnitude.
213 bind(B1_2);
214 movl(eax, Address(rsp, 12));
215 movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
216 andl(eax, 2147418112);
217 subl(eax, 808452096);
218 cmpl(eax, 281346048);
219 jcc(Assembler::above, L_2TAG_PACKET_0_0_1);
    // Main path. The range reduction, table loads, polynomial and compensated
    // sums of sections 1-6 of the file header are interleaved; the trailing
    // comments use the names from that description.
220 mulsd(xmm1, xmm0); // y = PI32INV * x
221 movdqu(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
222 movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
223 pand(xmm4, xmm0); // sign bit of x
224 por(xmm5, xmm4); // low lane: 0.5 carrying the sign of x
225 addpd(xmm1, xmm5); // y + (signed 0.5)
226 cvttsd2sil(edx, xmm1); // N = truncated value, i.e. y rounded to nearest
227 cvtsi2sdl(xmm1, edx); // N as a double
228 movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
229 movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
230 mulsd(xmm3, xmm1); // m_1 = N * P_1
231 unpcklpd(xmm1, xmm1); // [N, N]
    // 1865232 is congruent to 16 (mod 64), so after the mask the table index
    // is (N + 16) mod 64: a shift of 16 entries, i.e. pi/2 with B = M * pi / 32.
232 addq(rdx, 1865232);
233 movdqu(xmm4, xmm0); // copy of x
234 andq(rdx, 63);
235 movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
    // rax = address of the selected 32-byte table entry. As read below, the
    // entry holds C_hl at +0, S_hi at +8, S_lo at +16 and sigma at +24.
236 lea(rax, ExternalAddress(Ctable));
237 shlq(rdx, 5);
238 addq(rax, rdx);
239 mulpd(xmm2, xmm1); // m_2 = N * P_2 (both lanes)
240 subsd(xmm0, xmm3); // r_1 = x - m_1
241 mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
242 subsd(xmm4, xmm3); // r_1 = x - m_1 (second copy)
243 movq(xmm7, Address(rax, 8)); // S_hi
244 unpcklpd(xmm0, xmm0); // [r_1, r_1]
245 movdqu(xmm3, xmm4); // r_1
246 subsd(xmm4, xmm2); // r = r_1 - m_2
247 mulpd(xmm5, xmm0); // psc4 = SC_4 * r_1
248 subpd(xmm0, xmm2); // [r, r]
249 movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
250 mulsd(xmm7, xmm4); // S_hi * r
251 subsd(xmm3, xmm4); // c_1 = r_1 - r
252 mulpd(xmm5, xmm0); // msc4 = psc4 * r
253 mulpd(xmm0, xmm0); // r2 = r * r
254 subsd(xmm3, xmm2); // c_2 = c_1 - m_2
255 movdqu(xmm2, Address(rax, 0)); // [C_hl, S_hi]
256 subsd(xmm1, xmm3); // -c = m_3 - c_2
257 movq(xmm3, Address(rax, 24)); // sigma
258 addsd(xmm2, xmm3); // low lane: C_hl + sigma
259 subsd(xmm7, xmm2); // -d = S_hi * r - (C_hl + sigma)
260 mulsd(xmm2, xmm4); // low lane: (C_hl + sigma) * r
261 mulpd(xmm6, xmm0); // msc2 = SC_2 * r2
262 mulsd(xmm3, xmm4); // rs = sigma * r
263 mulpd(xmm2, xmm0); // [(C_hl + sigma) * r^3, S_hi * r^2]
264 mulpd(xmm0, xmm0); // r4 = r2 * r2
265 addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
266 mulsd(xmm4, Address(rax, 0)); // med = C_hl * r
267 addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
268 mulpd(xmm5, xmm0); // msc3 = r4 * psc3
269 movdqu(xmm0, xmm3); // rs
270 addsd(xmm3, Address(rax, 8)); // res_int = S_hi + rs
271 mulpd(xmm1, xmm7); // (-c) * (-d)
272 movdqu(xmm7, xmm4); // med
273 addsd(xmm4, xmm3); // res_hi = res_int + med
274 addpd(xmm6, xmm5); // sincospols = psc1 + msc3
275 movq(xmm5, Address(rax, 8)); // S_hi
276 subsd(xmm5, xmm3); // k_0 = S_hi - res_int
277 subsd(xmm3, xmm4); // k_1 = res_int - res_hi
278 addsd(xmm1, Address(rax, 16)); // corr = (-c) * (-d) + S_lo
279 mulpd(xmm6, xmm2); // pols (two lanes)
280 addsd(xmm0, xmm5); // k_2 = k_0 + rs
281 addsd(xmm3, xmm7); // k_3 = k_1 + med
282 addsd(xmm0, xmm1); // k_2 + corr
283 addsd(xmm0, xmm3); // + k_3
284 addsd(xmm0, xmm6); // + pols (low lane)
285 unpckhpd(xmm6, xmm6); // bring the high lane of pols down
286 addsd(xmm0, xmm6); // + pols (high lane) = res_lo
287 addsd(xmm0, xmm4); // result = res_lo + res_hi
288 jmp(B1_4);
289
    // Arguments outside the main-path range. The flags of the cmpl in the
    // range check are reused: signed "greater" selects the large-argument
    // path, everything else is a tiny argument.
290 bind(L_2TAG_PACKET_0_0_1);
291 jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
    // Tiny argument: clear the sign bit (mask the top 16-bit word with 0x7fff)
    // and return ONE - |x|.
292 pextrw(eax, xmm0, 3);
293 andl(eax, 32767);
294 pinsrw(xmm0, eax, 3);
295 movq(xmm1, ExternalAddress(ONE)); //0x00000000UL, 0x3ff00000UL
296 subsd(xmm1, xmm0);
297 movdqu(xmm0, xmm1);
298 jmp(B1_4);
299
    // Large argument. An exponent field of all ones (0x7ff0 in the top 16-bit
    // word) means Inf or NaN and is handled at L_2TAG_PACKET_2_0_1.
300 bind(L_2TAG_PACKET_1_0_1);
301 pextrw(eax, xmm0, 3);
302 andl(eax, 32752);
303 cmpl(eax, 32752);
304 jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
    // Derive from the exponent field a byte offset (a multiple of 4) into
    // PI_INV_TABLE and point rcx at that position.
305 pextrw(ecx, xmm0, 3);
306 andl(ecx, 32752);
307 subl(ecx, 16224);
308 shrl(ecx, 7);
309 andl(ecx, 65532);
310 lea(r11, ExternalAddress(PI_INV_TABLE));
311 addq(rcx, r11);
    // Split the significand of x: rdx = its low 32 bits, rax = its upper 21
    // bits with the top bit forced on by the orl/shrl pair.
312 movdq(rax, xmm0);
313 movl(r10, Address(rcx, 20));
314 movl(r8, Address(rcx, 24));
315 movl(edx, eax);
316 shrq(rax, 21);
317 orl(eax, INT_MIN);
318 shrl(eax, 11);
    // Multiply the significand (rax:rdx) by the 32-bit table words at offsets
    // 24 down to 0 from rcx, propagating carries 32 bits at a time. The
    // product is assembled in r8 (lowest 64 bits), r10 (middle 64 bits) and
    // r9 / r11 (top part).
    // NOTE(review): this looks like a multi-precision multiplication by a
    // stored expansion of 1/pi - confirm against the table's definition.
319 movl(r9, r10);
320 imulq(r10, rdx);
321 imulq(r9, rax);
322 imulq(r8, rax);
323 movl(rsi, Address(rcx, 16));
324 movl(rdi, Address(rcx, 12));
325 movl(r11, r10);
326 shrq(r10, 32);
327 addq(r9, r10);
328 addq(r11, r8);
329 movl(r8, r11);
330 shrq(r11, 32);
331 addq(r9, r11);
332 movl(r10, rsi);
333 imulq(rsi, rdx);
334 imulq(r10, rax);
335 movl(r11, rdi);
336 imulq(rdi, rdx);
337 movl(rbx, rsi);
338 shrq(rsi, 32);
339 addq(r9, rbx);
340 movl(rbx, r9);
341 shrq(r9, 32);
342 addq(r10, rsi);
343 addq(r10, r9);
344 shlq(rbx, 32);
345 orq(r8, rbx);
346 imulq(r11, rax);
347 movl(r9, Address(rcx, 8));
348 movl(rsi, Address(rcx, 4));
349 movl(rbx, rdi);
350 shrq(rdi, 32);
351 addq(r10, rbx);
352 movl(rbx, r10);
353 shrq(r10, 32);
354 addq(r11, rdi);
355 addq(r11, r10);
356 movq(rdi, r9);
357 imulq(r9, rdx);
358 imulq(rdi, rax);
359 movl(r10, r9);
360 shrq(r9, 32);
361 addq(r11, r10);
362 movl(r10, r11);
363 shrq(r11, 32);
364 addq(rdi, r9);
365 addq(rdi, r11);
366 movq(r9, rsi);
367 imulq(rsi, rdx);
368 imulq(r9, rax);
369 shlq(r10, 32);
370 orq(r10, rbx);
371 movl(eax, Address(rcx, 0));
372 movl(r11, rsi);
373 shrq(rsi, 32);
374 addq(rdi, r11);
375 movl(r11, rdi);
376 shrq(rdi, 32);
377 addq(r9, rsi);
378 addq(r9, rdi);
379 imulq(rdx, rax);
    // rbx = top 16-bit word of x; rcx is turned back into a table offset and
    // scaled by 8 (three doublings) plus 19; rsi keeps the sign bit (0x8000);
    // rbx becomes the unbiased exponent, which is subtracted from ecx.
    // edx = ecx + 32 is the exponent bookkeeping value used when the result
    // is converted back to a double at L_2TAG_PACKET_10_0_1.
380 pextrw(rbx, xmm0, 3);
381 lea(rdi, ExternalAddress(PI_INV_TABLE));
382 subq(rcx, rdi);
383 addl(ecx, ecx);
384 addl(ecx, ecx);
385 addl(ecx, ecx);
386 addl(ecx, 19);
387 movl(rsi, 32768);
388 andl(rsi, rbx);
389 shrl(rbx, 4);
390 andl(rbx, 2047);
391 subl(rbx, 1023);
392 subl(ecx, rbx);
393 addq(r9, rdx);
394 movl(edx, ecx);
395 addl(edx, 32);
396 cmpl(ecx, 1);
397 jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
    // Align the top word, keep a copy in rdi (its bits 29 and up are used
    // later as the multiple of pi/4), keep the low 29 bits in r9 and test
    // bit 28 (0x10000000) of that fraction.
398 negl(ecx);
399 addl(ecx, 29);
400 shll(r9);
401 movl(rdi, r9);
402 andl(r9, 536870911);
403 testl(r9, 268435456);
404 jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
405 shrl(r9);
406 movl(rbx, 0); // no sign flip
407 shlq(r9, 32);
408 orq(r9, r11);
409
410 bind(L_2TAG_PACKET_5_0_1);
411
    // If the top 64 bits of the fraction are zero, shift in the lower words
    // first (L_2TAG_PACKET_7_0_1).
412 bind(L_2TAG_PACKET_6_0_1);
413 cmpq(r9, 0);
414 jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
415
    // Normalize: r11 = index of the highest set bit of r9, ecx = 29 - r11.
    // A positive count shifts r9:r10 left (pulling bits in from r10 and r8)
    // and adds the count to edx; zero or negative counts are handled at
    // L_2TAG_PACKET_9_0_1, which reuses the flags of the subl.
416 bind(L_2TAG_PACKET_8_0_1);
417 bsrq(r11, r9);
418 movl(ecx, 29);
419 subl(ecx, r11);
420 jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
421 shlq(r9);
422 movq(rax, r10);
423 shlq(r10);
424 addl(edx, ecx);
425 negl(ecx);
426 addl(ecx, 64);
427 shrq(rax);
428 shrq(r8);
429 orq(r9, rax);
430 orq(r10, r8);
431
    // Convert the fixed-point fraction to a double pair. r9 and r10 / 2 are
    // converted to doubles and scaled by two powers of two whose top 16-bit
    // words are built in edx: 16368 (0x3ff0) minus 16 * edx, combined with
    // the sign bit of x (rsi) and the flip flag (rbx); the second scale uses
    // that word minus 1008 (0x3f0). The scaled values are then multiplied by
    // the two doubles stored at PI_4 and PI_4 + 8, leaving a high part in
    // xmm0 and a low part in xmm6.
432 bind(L_2TAG_PACKET_10_0_1);
433 cvtsi2sdq(xmm0, r9);
434 shrq(r10, 1);
435 cvtsi2sdq(xmm3, r10);
436 xorpd(xmm4, xmm4);
437 shll(edx, 4);
438 negl(edx);
439 addl(edx, 16368);
440 orl(edx, rsi);
441 xorl(edx, rbx);
442 pinsrw(xmm4, edx, 3);
443 movq(xmm2, ExternalAddress(PI_4)); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
444 movq(xmm6, ExternalAddress(8 + PI_4)); //0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
445 xorpd(xmm5, xmm5);
446 subl(edx, 1008);
447 pinsrw(xmm5, edx, 3);
448 mulsd(xmm0, xmm4);
    // rsi becomes 0 or -1 from the sign bit of x; (rdi + rsi) ^ rsi then
    // negates rdi (already shifted right by 29) when x is negative.
449 shll(rsi, 16);
450 sarl(rsi, 31);
451 mulsd(xmm3, xmm5);
452 movdqu(xmm1, xmm0);
453 mulsd(xmm0, xmm2);
454 shrl(rdi, 29);
455 addsd(xmm1, xmm3);
456 mulsd(xmm3, xmm2);
457 addl(rdi, rsi);
458 xorl(rdi, rsi);
459 mulsd(xmm6, xmm1);
460 movl(eax, rdi);
461 addsd(xmm6, xmm3);
462 movdqu(xmm2, xmm0);
463 addsd(xmm0, xmm6);
464 subsd(xmm2, xmm0);
465 addsd(xmm6, xmm2);
466
    // Same evaluation sequence as the main path, applied to the reduced
    // argument (xmm0 = high part, xmm6 = low part). Differences from the main
    // path: N is converted with the 64-bit cvttsd2siq / cvtsi2sdq forms, eax
    // (copied from rdi) is multiplied by 8 and added to the table index, so
    // each unit of eax advances the table by 8 entries (pi/4 with
    // B = M * pi / 32), and the low part xmm6 is subtracted from the -c term.
467 bind(L_2TAG_PACKET_11_0_1);
468 movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
469 mulsd(xmm1, xmm0);
470 movq(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
471 movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
472 pand(xmm4, xmm0);
473 por(xmm5, xmm4);
474 addpd(xmm1, xmm5);
475 cvttsd2siq(rdx, xmm1);
476 cvtsi2sdq(xmm1, rdx);
477 movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
478 movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
479 mulsd(xmm3, xmm1);
480 unpcklpd(xmm1, xmm1);
481 shll(eax, 3);
482 addl(edx, 1865232);
483 movdqu(xmm4, xmm0);
484 addl(edx, eax);
485 andl(edx, 63);
486 movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
487 lea(rax, ExternalAddress(Ctable));
488 shll(edx, 5);
489 addq(rax, rdx);
490 mulpd(xmm2, xmm1);
491 subsd(xmm0, xmm3);
492 mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
493 subsd(xmm4, xmm3);
494 movq(xmm7, Address(rax, 8));
495 unpcklpd(xmm0, xmm0);
496 movdqu(xmm3, xmm4);
497 subsd(xmm4, xmm2);
498 mulpd(xmm5, xmm0);
499 subpd(xmm0, xmm2);
500 mulsd(xmm7, xmm4);
501 subsd(xmm3, xmm4);
502 mulpd(xmm5, xmm0);
503 mulpd(xmm0, xmm0);
504 subsd(xmm3, xmm2);
505 movdqu(xmm2, Address(rax, 0));
506 subsd(xmm1, xmm3);
507 movq(xmm3, Address(rax, 24));
508 addsd(xmm2, xmm3);
509 subsd(xmm7, xmm2);
510 subsd(xmm1, xmm6); // fold the low part of the reduced argument into -c
511 movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
512 mulsd(xmm2, xmm4);
513 mulpd(xmm6, xmm0);
514 mulsd(xmm3, xmm4);
515 mulpd(xmm2, xmm0);
516 mulpd(xmm0, xmm0);
517 addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
518 mulsd(xmm4, Address(rax, 0));
519 addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
520 mulpd(xmm5, xmm0);
521 movdqu(xmm0, xmm3);
522 addsd(xmm3, Address(rax, 8));
523 mulpd(xmm1, xmm7);
524 movdqu(xmm7, xmm4);
525 addsd(xmm4, xmm3);
526 addpd(xmm6, xmm5);
527 movq(xmm5, Address(rax, 8));
528 subsd(xmm5, xmm3);
529 subsd(xmm3, xmm4);
530 addsd(xmm1, Address(rax, 16));
531 mulpd(xmm6, xmm2);
532 addsd(xmm5, xmm0);
533 addsd(xmm3, xmm7);
534 addsd(xmm1, xmm5);
535 addsd(xmm1, xmm3);
536 addsd(xmm1, xmm6);
537 unpckhpd(xmm6, xmm6);
538 movdqu(xmm0, xmm4);
539 addsd(xmm1, xmm6);
540 addsd(xmm0, xmm1);
541 jmp(B1_4);
542
    // The top 64 bits of the fraction are zero: move the words up by 64 bits
    // (adding 64 to edx each time) and retry, at most twice. If everything is
    // zero the reduced argument is 0 (both xmm0 and xmm6 cleared).
543 bind(L_2TAG_PACKET_7_0_1);
544 addl(edx, 64);
545 movq(r9, r10);
546 movq(r10, r8);
547 movl(r8, 0);
548 cmpq(r9, 0);
549 jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
550 addl(edx, 64);
551 movq(r9, r10);
552 movq(r10, r8);
553 cmpq(r9, 0);
554 jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
555 xorpd(xmm0, xmm0);
556 xorpd(xmm6, xmm6);
557 jmp(L_2TAG_PACKET_11_0_1);
558
    // Normalization count was zero or negative (flags from the subl at
    // L_2TAG_PACKET_8_0_1): zero needs no shift; otherwise shift r9:r10 right
    // by the negated count and subtract it from edx.
559 bind(L_2TAG_PACKET_9_0_1);
560 jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
561 negl(ecx);
562 shrq(r10);
563 movq(rax, r9);
564 shrq(r9);
565 subl(edx, ecx);
566 negl(ecx);
567 addl(ecx, 64);
568 shlq(rax);
569 orq(r10, rax);
570 jmp(L_2TAG_PACKET_10_0_1);
    // Taken when ecx was below 1 at the cmpl(ecx, 1) above: the top word is
    // combined with r11 into 64 bits before it is aligned, and bit 31 of the
    // aligned value is tested instead of bit 28.
571 bind(L_2TAG_PACKET_3_0_1);
572 negl(ecx);
573 shlq(r9, 32);
574 orq(r9, r11);
575 shlq(r9);
576 movq(rdi, r9);
577 testl(r9, INT_MIN);
578 jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
579 shrl(r9);
580 movl(rbx, 0); // no sign flip
581 shrq(rdi, 3);
582 jmp(L_2TAG_PACKET_6_0_1);
583
    // The tested fraction bit was set: replace the three-word fraction
    // (r9:r10:r8) by a constant minus itself using a subtract-with-borrow
    // chain, add 0x20000000 to rdi (one unit at bit 29, the position later
    // extracted by shrl(rdi, 29)), and set rbx to 0x8000 so that the sign of
    // the scale factors is flipped at L_2TAG_PACKET_10_0_1.
584 bind(L_2TAG_PACKET_4_0_1);
585 shrl(r9);
586 movl(rbx, 536870912);
587 shrl(rbx);
588 shlq(r9, 32);
589 orq(r9, r11);
590 shlq(rbx, 32);
591 addl(rdi, 536870912);
592 movl(rcx, 0);
593 movl(r11, 0);
594 subq(rcx, r8);
595 sbbq(r11, r10);
596 sbbq(rbx, r9);
597 movq(r8, rcx);
598 movq(r10, r11);
599 movq(r9, rbx);
600 movl(rbx, 32768);
601 jmp(L_2TAG_PACKET_5_0_1);
602
    // Counterpart of L_2TAG_PACKET_4_0_1 for the L_2TAG_PACKET_3_0_1 path:
    // same subtract-with-borrow chain, sign-flip flag and rdi adjustment.
603 bind(L_2TAG_PACKET_12_0_1);
604 shrl(r9);
605 mov64(rbx, 0x100000000);
606 shrq(rbx);
607 movl(rcx, 0);
608 movl(r11, 0);
609 subq(rcx, r8);
610 sbbq(r11, r10);
611 sbbq(rbx, r9);
612 movq(r8, rcx);
613 movq(r10, r11);
614 movq(r9, rbx);
615 movl(rbx, 32768);
616 shrq(rdi, 3);
617 addl(rdi, 536870912);
618 jmp(L_2TAG_PACKET_6_0_1);
619
    // Inf or NaN: reload x from the stack and multiply it by NEG_ZERO; the
    // file header documents the result as NaN with the invalid exception
    // raised. The product stays in xmm0; the copy stored at rsp + 0 is not
    // read again in this function.
620 bind(L_2TAG_PACKET_2_0_1);
621 movsd(xmm0, Address(rsp, 8));
622 mulsd(xmm0, ExternalAddress(NEG_ZERO)); //0x00000000UL, 0x80000000UL
623 movq(Address(rsp, 0), xmm0);
624
625 bind(L_2TAG_PACKET_13_0_1);
626
    // Common exit: release the 16 bytes of stack and restore rbx.
627 bind(B1_4);
628 addq(rsp, 16);
629 pop(rbx);
630}
631#else
632// The 32 bit code is at most SSE2 compliant
633
// Constant pool for the 32-bit fast_cos() below, which addresses it by byte
// offset from the table base. All doubles are stored as two 32-bit words, low
// word first.
//
// Bytes 0..2047: 64 entries of 32 bytes each, selected by (index & 63) * 32.
// As read by fast_cos(), an entry holds four doubles, named as in the
// algorithm description at the top of this file:
//   +0 C_hl    +8 S_hi    +16 S_lo    +24 sigma
//
// Bytes 2048 and up: constants read by fast_cos() at fixed offsets. The names
// are those used for the same values by the 64-bit fast_cos():
//   2048 SC_1 (two doubles)     2064 SC_2 (two doubles)
//   2080 SC_3 (two doubles)     2096 SC_4 (two doubles)
//   2112 P_2 (stored twice)     2128 P_1
//   2144 P_3                    2160 PI32INV
//   2176 high word 0x43380000; not referenced by fast_cos() in this file
//        (presumably the SHIFTER mentioned in the algorithm description)
//   2192 1.0                    2208 -0.0 (NEG_ZERO)
//   2224 sign mask              2240 0.5 (stored twice)
//
// NOTE(review): the 16-byte alignment is presumably required because
// fast_cos() uses table addresses directly as packed-double memory operands
// (addpd) - confirm before changing the layout or the alignment.
634ATTRIBUTE_ALIGNED(16) juint _static_const_table_cos[] =
635{
636 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
637 0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL,
638 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
639 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL,
640 0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL,
641 0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL,
642 0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL,
643 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL,
644 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL,
645 0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL,
646 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
647 0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL,
648 0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL,
649 0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL,
650 0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL,
651 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL,
652 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL,
653 0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL,
654 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
655 0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL,
656 0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL,
657 0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL,
658 0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL,
659 0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL,
660 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL,
661 0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL,
662 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
663 0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL,
664 0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL,
665 0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL,
666 0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL,
667 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL,
668 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL,
669 0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL,
670 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
671 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL,
672 0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL,
673 0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL,
674 0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL,
675 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL,
676 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL,
677 0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL,
678 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
679 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL,
680 0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL,
681 0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL,
682 0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL,
683 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL,
684 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL,
685 0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL,
686 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
687 0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
688 0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL,
689 0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
690 0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL,
691 0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL,
692 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL,
693 0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL,
694 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL,
695 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL,
696 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL,
697 0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
698 0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL,
699 0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL,
700 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL,
701 0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL,
702 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL,
703 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL,
704 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL,
705 0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
706 0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL,
707 0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL,
708 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL,
709 0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL,
710 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL,
711 0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL,
712 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL,
713 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL,
714 0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL,
715 0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL,
716 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL,
717 0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL,
718 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL,
719 0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL,
720 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL,
721 0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
722 0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL,
723 0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL,
724 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL,
725 0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL,
726 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL,
727 0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL,
728 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL,
729 0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
730 0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL,
731 0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL,
732 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL,
733 0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL,
734 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL,
735 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL,
736 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL,
737 0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
738 0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL,
739 0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL,
740 0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL,
741 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL,
742 0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL,
743 0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL,
744 0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
745 0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL,
746 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL,
747 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL,
748 0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
749};
750//registers,
751// input: (rbp + 8)
752// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
753// rax, rdx, rcx, rbx (tmp)
754
755// Code generated by Intel C compiler for LIBM library
756
// Emits the 32-bit x86 instruction sequence that computes cos(x) for a
// double, following the algorithm described at the top of this file.
//
// Data flow visible in this function:
//   - 120 bytes of stack are reserved; the argument is then loaded from
//     rsp + 128, i.e. 8 bytes above the stack pointer on entry.
//   - tmp is saved at rsp + 56, used as the base address of
//     _static_const_table_cos (all constants are read at byte offsets from
//     it), and reloaded on exit.
//   - Every path leaves its result on the x87 register stack (fld_d, or
//     fld_d followed by fmul_d).
//   - Finite arguments above the main-path range are passed to
//     StubRoutines::dlibm_sin_cos_huge().
//
// Parameters:
//   xmm0..xmm7     - XMM scratch registers (xmm0 receives the argument).
//   eax, ecx, edx  - general purpose scratch registers.
//   tmp            - general purpose register holding the table base; its
//                    previous value is preserved.
//
// NOTE(review): neither a stack-pointer adjustment matching subl(rsp, 120)
// nor a return instruction is emitted here; presumably the caller's frame
// teardown takes care of both - confirm against the call site.
757void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
758 Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
759 Label start;
760
761 assert_different_registers(tmp, eax, ecx, edx);
762
763 address static_const_table_cos = (address)_static_const_table_cos;
764
    // Prologue, argument load and range check. The top 16-bit word of x is
    // masked with 0x7fff, reduced by 12336 (0x3030) and compared unsigned
    // with 4293 (0x10c5): "above" is taken both when the subtraction wraps
    // and when the word is too large; per the file header the main path
    // covers 2^-252 up to (but excluding) 90112 in magnitude.
765 bind(start);
766 subl(rsp, 120);
767 movl(Address(rsp, 56), tmp);
768 lea(tmp, ExternalAddress(static_const_table_cos));
769 movsd(xmm0, Address(rsp, 128));
770 pextrw(eax, xmm0, 3);
771 andl(eax, 32767);
772 subl(eax, 12336);
773 cmpl(eax, 4293);
774 jcc(Assembler::above, L_2TAG_PACKET_0_0_2);
    // Main path. The range reduction, table loads, polynomial and compensated
    // sums of sections 1-6 of the file header are interleaved; the trailing
    // comments use the names from that description.
775 movsd(xmm1, Address(tmp, 2160)); // 0x40245f30:6dc9c883 (32/pi per the header's y = 32/pi * x)
776 mulsd(xmm1, xmm0); // y = xmm1 * x
777 movdqu(xmm5, Address(tmp, 2240)); // [0.5, 0.5]
778 movsd(xmm4, Address(tmp, 2224)); // sign mask
779 pand(xmm4, xmm0); // sign bit of x
780 por(xmm5, xmm4); // low lane: 0.5 carrying the sign of x
781 movsd(xmm3, Address(tmp, 2128)); // 0x3fb921fb:54400000 (P_1)
782 movdqu(xmm2, Address(tmp, 2112)); // 0x3d90b461:1a600000 twice (P_2)
783 addpd(xmm1, xmm5); // y + (signed 0.5)
784 cvttsd2sil(edx, xmm1); // N = truncated value, i.e. y rounded to nearest
785 cvtsi2sdl(xmm1, edx); // N as a double
786 mulsd(xmm3, xmm1); // m_1 = N * P_1
787 unpcklpd(xmm1, xmm1); // [N, N]
    // 1865232 is congruent to 16 (mod 64), so after the mask the table index
    // is (N + 16) mod 64: a shift of 16 entries, i.e. pi/2 with B = M * pi / 32.
788 addl(edx, 1865232);
789 movdqu(xmm4, xmm0); // copy of x
790 andl(edx, 63);
791 movdqu(xmm5, Address(tmp, 2096)); // 0x3ec71de3:a556c734, 0x3efa01a0:1a01a01a (SC_4)
    // eax = address of the selected 32-byte table entry. As read below, the
    // entry holds C_hl at +0, S_hi at +8, S_lo at +16 and sigma at +24.
792 lea(eax, Address(tmp, 0));
793 shll(edx, 5);
794 addl(eax, edx);
795 mulpd(xmm2, xmm1); // m_2 = N * P_2 (both lanes)
796 subsd(xmm0, xmm3); // r_1 = x - m_1
797 mulsd(xmm1, Address(tmp, 2144)); // m_3 = N * P_3
798 subsd(xmm4, xmm3); // r_1 = x - m_1 (second copy)
799 movsd(xmm7, Address(eax, 8)); // S_hi
800 unpcklpd(xmm0, xmm0); // [r_1, r_1]
801 movapd(xmm3, xmm4); // r_1
802 subsd(xmm4, xmm2); // r = r_1 - m_2
803 mulpd(xmm5, xmm0); // psc4 = SC_4 * r_1
804 subpd(xmm0, xmm2); // [r, r]
805 movdqu(xmm6, Address(tmp, 2064)); // 0x3f811111:11111111, 0x3fa55555:55555555 (SC_2)
806 mulsd(xmm7, xmm4); // S_hi * r
807 subsd(xmm3, xmm4); // c_1 = r_1 - r
808 mulpd(xmm5, xmm0); // msc4 = psc4 * r
809 mulpd(xmm0, xmm0); // r2 = r * r
810 subsd(xmm3, xmm2); // c_2 = c_1 - m_2
811 movdqu(xmm2, Address(eax, 0)); // [C_hl, S_hi]
812 subsd(xmm1, xmm3); // -c = m_3 - c_2
813 movsd(xmm3, Address(eax, 24)); // sigma
814 addsd(xmm2, xmm3); // low lane: C_hl + sigma
815 subsd(xmm7, xmm2); // -d = S_hi * r - (C_hl + sigma)
816 mulsd(xmm2, xmm4); // low lane: (C_hl + sigma) * r
817 mulpd(xmm6, xmm0); // msc2 = SC_2 * r2
818 mulsd(xmm3, xmm4); // rs = sigma * r
819 mulpd(xmm2, xmm0); // [(C_hl + sigma) * r^3, S_hi * r^2]
820 mulpd(xmm0, xmm0); // r4 = r2 * r2
821 addpd(xmm5, Address(tmp, 2080)); // psc3 = SC_3 + msc4
822 mulsd(xmm4, Address(eax, 0)); // med = C_hl * r
823 addpd(xmm6, Address(tmp, 2048)); // psc1 = SC_1 + msc2
824 mulpd(xmm5, xmm0); // msc3 = r4 * psc3
825 movapd(xmm0, xmm3); // rs
826 addsd(xmm3, Address(eax, 8)); // res_int = S_hi + rs
827 mulpd(xmm1, xmm7); // (-c) * (-d)
828 movapd(xmm7, xmm4); // med
829 addsd(xmm4, xmm3); // res_hi = res_int + med
830 addpd(xmm6, xmm5); // sincospols = psc1 + msc3
831 movsd(xmm5, Address(eax, 8)); // S_hi
832 subsd(xmm5, xmm3); // k_0 = S_hi - res_int
833 subsd(xmm3, xmm4); // k_1 = res_int - res_hi
834 addsd(xmm1, Address(eax, 16)); // corr = (-c) * (-d) + S_lo
835 mulpd(xmm6, xmm2); // pols (two lanes)
836 addsd(xmm5, xmm0); // k_2 = k_0 + rs
837 addsd(xmm3, xmm7); // k_3 = k_1 + med
838 addsd(xmm1, xmm5); // corr + k_2
839 addsd(xmm1, xmm3); // + k_3
840 addsd(xmm1, xmm6); // + pols (low lane)
841 unpckhpd(xmm6, xmm6); // bring the high lane of pols down
842 addsd(xmm1, xmm6); // + pols (high lane) = res_lo
843 addsd(xmm4, xmm1); // result = res_hi + res_lo
    // Move the result to the x87 stack through memory.
844 movsd(Address(rsp, 0), xmm4);
845 fld_d(Address(rsp, 0));
846 jmp(L_2TAG_PACKET_1_0_2);
847
    // Arguments outside the main-path range. The flags of the cmpl in the
    // range check are reused: signed "greater" selects the large-argument
    // path, everything else is a tiny argument.
848 bind(L_2TAG_PACKET_0_0_2);
849 jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
    // Tiny argument: clear the sign bit (mask the top 16-bit word with 0x7fff)
    // and return 1.0 - |x|, with 1.0 read from table offset 2192.
850 pextrw(eax, xmm0, 3);
851 andl(eax, 32767);
852 pinsrw(xmm0, eax, 3);
853 movsd(xmm1, Address(tmp, 2192));
854 subsd(xmm1, xmm0);
855 movsd(Address(rsp, 0), xmm1);
856 fld_d(Address(rsp, 0));
857 jmp(L_2TAG_PACKET_1_0_2);
858
    // Large argument. The upper 32 bits of x are reloaded from the stack; an
    // exponent field of all ones (mask 0x7ff00000) means Inf or NaN.
859 bind(L_2TAG_PACKET_2_0_2);
860 movl(eax, Address(rsp, 132));
861 andl(eax, 2146435072);
862 cmpl(eax, 2146435072);
863 jcc(Assembler::equal, L_2TAG_PACKET_3_0_2);
    // Call the out-of-line helper with three stack arguments: x at rsp + 0,
    // the address of an 8-byte slot in this frame at rsp + 8, and the integer
    // 1 at rsp + 12. After the call that slot (rsp + 8 once the 32 bytes are
    // released) is loaded as the result.
    // NOTE(review): the third argument presumably selects cos rather than
    // sin - confirm against dlibm_sin_cos_huge.
864 subl(rsp, 32);
865 movsd(Address(rsp, 0), xmm0);
866 lea(eax, Address(rsp, 40));
867 movl(Address(rsp, 8), eax);
868 movl(eax, 1);
869 movl(Address(rsp, 12), eax);
870 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge())));
871 addl(rsp, 32);
872 fld_d(Address(rsp, 8));
873 jmp(L_2TAG_PACKET_1_0_2);
874
    // Inf or NaN: multiply x by the constant at table offset 2208 (-0.0) on
    // the x87 stack; the file header documents the result as NaN with the
    // invalid exception raised.
875 bind(L_2TAG_PACKET_3_0_2);
876 fld_d(Address(rsp, 128));
877 fmul_d(Address(tmp, 2208));
878
    // Common exit: restore tmp.
879 bind(L_2TAG_PACKET_1_0_2);
880 movl(tmp, Address(rsp, 56));
881}
882#endif
883