1/*
2 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "asm/macroAssembler.hpp"
27#include "asm/macroAssembler.inline.hpp"
28#include "ci/ciUtilities.hpp"
29#include "gc/shared/barrierSet.hpp"
30#include "gc/shared/barrierSetAssembler.hpp"
31#include "gc/shared/barrierSetNMethod.hpp"
32#include "interpreter/interpreter.hpp"
33#include "memory/universe.hpp"
34#include "nativeInst_x86.hpp"
35#include "oops/instanceOop.hpp"
36#include "oops/method.hpp"
37#include "oops/objArrayKlass.hpp"
38#include "oops/oop.inline.hpp"
39#include "prims/methodHandles.hpp"
40#include "runtime/frame.inline.hpp"
41#include "runtime/handles.inline.hpp"
42#include "runtime/sharedRuntime.hpp"
43#include "runtime/stubCodeGenerator.hpp"
44#include "runtime/stubRoutines.hpp"
45#include "runtime/thread.inline.hpp"
46#ifdef COMPILER2
47#include "opto/runtime.hpp"
48#endif
49#if INCLUDE_ZGC
50#include "gc/z/zThreadLocalData.hpp"
51#endif
52
53// Declaration and definition of StubGenerator (no .hpp file).
54// For a more detailed description of the stub routine structure
55// see the comment in stubRoutines.hpp
56
57#define __ _masm->
58#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
59#define a__ ((Assembler*)_masm)->
60
61#ifdef PRODUCT
62#define BLOCK_COMMENT(str) /* nothing */
63#else
64#define BLOCK_COMMENT(str) __ block_comment(str)
65#endif
66
67#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
68const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
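// Note: MXCSR bits 0-5 are the sticky exception status flags; bits 6-15 hold
// DAZ, the exception mask bits, the rounding control and FZ. Masking with
// 0xFFC0 therefore keeps only the control/mask bits, so comparisons against
// the expected MXCSR value are not perturbed by pending exception flags.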
69
70// Stub Code definitions
71
72class StubGenerator: public StubCodeGenerator {
73 private:
74
75#ifdef PRODUCT
76#define inc_counter_np(counter) ((void)0)
77#else
78 void inc_counter_np_(int& counter) {
79 // This can destroy rscratch1 if counter is far from the code cache
80 __ incrementl(ExternalAddress((address)&counter));
81 }
82#define inc_counter_np(counter) \
83 BLOCK_COMMENT("inc_counter " #counter); \
84 inc_counter_np_(counter);
85#endif
86
87 // Call stubs are used to call Java from C
88 //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp):   parameter size (in words)              int
  //    24(rbp):   thread                                 Thread*
98 //
99 // [ return_from_Java ] <--- rsp
100 // [ argument word n ]
101 // ...
102 // -12 [ argument word 1 ]
103 // -11 [ saved r15 ] <--- rsp_after_call
104 // -10 [ saved r14 ]
105 // -9 [ saved r13 ]
106 // -8 [ saved r12 ]
107 // -7 [ saved rbx ]
108 // -6 [ call wrapper ]
109 // -5 [ result ]
110 // -4 [ result type ]
111 // -3 [ method ]
112 // -2 [ entry point ]
113 // -1 [ parameters ]
114 // 0 [ saved rbp ] <--- rbp
115 // 1 [ return address ]
116 // 2 [ parameter size ]
117 // 3 [ thread ]
118 //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp):   (interpreter) entry point              address
  //    56(rbp):   parameters                             intptr_t*
  //    64(rbp):   parameter size (in words)              int
  //    72(rbp):   thread                                 Thread*
128 //
129 // [ return_from_Java ] <--- rsp
130 // [ argument word n ]
131 // ...
132 // -60 [ argument word 1 ]
  // -59 [ saved xmm31 ] <--- rsp_after_call
134 // [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank)
135 // -27 [ saved xmm15 ]
136 // [ saved xmm7-xmm14 ]
137 // -9 [ saved xmm6 ] (each xmm register takes 2 slots)
138 // -7 [ saved r15 ]
139 // -6 [ saved r14 ]
140 // -5 [ saved r13 ]
141 // -4 [ saved r12 ]
142 // -3 [ saved rdi ]
143 // -2 [ saved rsi ]
144 // -1 [ saved rbx ]
145 // 0 [ saved rbp ] <--- rbp
146 // 1 [ return address ]
147 // 2 [ call wrapper ]
148 // 3 [ result ]
149 // 4 [ result type ]
150 // 5 [ method ]
151 // 6 [ entry point ]
152 // 7 [ parameters ]
153 // 8 [ parameter size ]
154 // 9 [ thread ]
155 //
  // Windows reserves the caller's stack space for arguments 1-4.
157 // We spill c_rarg0-c_rarg3 to this space.
158
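  // For orientation, the register/stack arguments above correspond to a
  // C-level entry of roughly this shape (an illustrative sketch only; the
  // actual CallStub typedef lives in stubRoutines.hpp):
  //
  //   void call_stub(address   call_wrapper,    // c_rarg0
  //                  intptr_t* result,          // c_rarg1
  //                  BasicType result_type,     // c_rarg2
  //                  Method*   method,          // c_rarg3
  //                  address   entry_point,     // c_rarg4 / 48(rbp) on Windows
  //                  intptr_t* parameters,      // c_rarg5 / 56(rbp) on Windows
  //                  int       parameter_size,  // 16(rbp) / 64(rbp)
  //                  Thread*   thread);         // 24(rbp) / 72(rbp)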
159 // Call stub stack layout word offsets from rbp
160 enum call_stub_layout {
161#ifdef _WIN64
162 xmm_save_first = 6, // save from xmm6
163 xmm_save_last = 31, // to xmm31
164 xmm_save_base = -9,
165 rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
166 r15_off = -7,
167 r14_off = -6,
168 r13_off = -5,
169 r12_off = -4,
170 rdi_off = -3,
171 rsi_off = -2,
172 rbx_off = -1,
173 rbp_off = 0,
174 retaddr_off = 1,
175 call_wrapper_off = 2,
176 result_off = 3,
177 result_type_off = 4,
178 method_off = 5,
179 entry_point_off = 6,
180 parameters_off = 7,
181 parameter_size_off = 8,
182 thread_off = 9
183#else
184 rsp_after_call_off = -12,
185 mxcsr_off = rsp_after_call_off,
186 r15_off = -11,
187 r14_off = -10,
188 r13_off = -9,
189 r12_off = -8,
190 rbx_off = -7,
191 call_wrapper_off = -6,
192 result_off = -5,
193 result_type_off = -4,
194 method_off = -3,
195 entry_point_off = -2,
196 parameters_off = -1,
197 rbp_off = 0,
198 retaddr_off = 1,
199 parameter_size_off = 2,
200 thread_off = 3
201#endif
202 };
203
204#ifdef _WIN64
205 Address xmm_save(int reg) {
206 assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
207 return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
208 }
209#endif
210
211 address generate_call_stub(address& return_address) {
212 assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
213 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
214 "adjust this code");
215 StubCodeMark mark(this, "StubRoutines", "call_stub");
216 address start = __ pc();
217
218 // same as in generate_catch_exception()!
219 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
220
221 const Address call_wrapper (rbp, call_wrapper_off * wordSize);
222 const Address result (rbp, result_off * wordSize);
223 const Address result_type (rbp, result_type_off * wordSize);
224 const Address method (rbp, method_off * wordSize);
225 const Address entry_point (rbp, entry_point_off * wordSize);
226 const Address parameters (rbp, parameters_off * wordSize);
227 const Address parameter_size(rbp, parameter_size_off * wordSize);
228
229 // same as in generate_catch_exception()!
230 const Address thread (rbp, thread_off * wordSize);
231
232 const Address r15_save(rbp, r15_off * wordSize);
233 const Address r14_save(rbp, r14_off * wordSize);
234 const Address r13_save(rbp, r13_off * wordSize);
235 const Address r12_save(rbp, r12_off * wordSize);
236 const Address rbx_save(rbp, rbx_off * wordSize);
237
238 // stub code
239 __ enter();
240 __ subptr(rsp, -rsp_after_call_off * wordSize);
241
242 // save register parameters
243#ifndef _WIN64
244 __ movptr(parameters, c_rarg5); // parameters
245 __ movptr(entry_point, c_rarg4); // entry_point
246#endif
247
248 __ movptr(method, c_rarg3); // method
249 __ movl(result_type, c_rarg2); // result type
250 __ movptr(result, c_rarg1); // result
251 __ movptr(call_wrapper, c_rarg0); // call wrapper
252
253 // save regs belonging to calling function
254 __ movptr(rbx_save, rbx);
255 __ movptr(r12_save, r12);
256 __ movptr(r13_save, r13);
257 __ movptr(r14_save, r14);
258 __ movptr(r15_save, r15);
259
260#ifdef _WIN64
261 int last_reg = 15;
262 if (UseAVX > 2) {
263 last_reg = 31;
264 }
265 if (VM_Version::supports_evex()) {
266 for (int i = xmm_save_first; i <= last_reg; i++) {
267 __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
268 }
269 } else {
270 for (int i = xmm_save_first; i <= last_reg; i++) {
271 __ movdqu(xmm_save(i), as_XMMRegister(i));
272 }
273 }
274
275 const Address rdi_save(rbp, rdi_off * wordSize);
276 const Address rsi_save(rbp, rsi_off * wordSize);
277
278 __ movptr(rsi_save, rsi);
279 __ movptr(rdi_save, rdi);
280#else
281 const Address mxcsr_save(rbp, mxcsr_off * wordSize);
282 {
283 Label skip_ldmx;
284 __ stmxcsr(mxcsr_save);
285 __ movl(rax, mxcsr_save);
286 __ andl(rax, MXCSR_MASK); // Only check control and mask bits
287 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
288 __ cmp32(rax, mxcsr_std);
289 __ jcc(Assembler::equal, skip_ldmx);
290 __ ldmxcsr(mxcsr_std);
291 __ bind(skip_ldmx);
292 }
293#endif
294
295 // Load up thread register
296 __ movptr(r15_thread, thread);
297 __ reinit_heapbase();
298
299#ifdef ASSERT
300 // make sure we have no pending exceptions
301 {
302 Label L;
303 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
304 __ jcc(Assembler::equal, L);
305 __ stop("StubRoutines::call_stub: entered with pending exception");
306 __ bind(L);
307 }
308#endif
309
310 // pass parameters if any
311 BLOCK_COMMENT("pass parameters if any");
312 Label parameters_done;
313 __ movl(c_rarg3, parameter_size);
314 __ testl(c_rarg3, c_rarg3);
315 __ jcc(Assembler::zero, parameters_done);
316
317 Label loop;
318 __ movptr(c_rarg2, parameters); // parameter pointer
319 __ movl(c_rarg1, c_rarg3); // parameter counter is in c_rarg1
320 __ BIND(loop);
321 __ movptr(rax, Address(c_rarg2, 0));// get parameter
322 __ addptr(c_rarg2, wordSize); // advance to next parameter
323 __ decrementl(c_rarg1); // decrement counter
324 __ push(rax); // pass parameter
325 __ jcc(Assembler::notZero, loop);
326
327 // call Java function
328 __ BIND(parameters_done);
329 __ movptr(rbx, method); // get Method*
330 __ movptr(c_rarg1, entry_point); // get entry_point
331 __ mov(r13, rsp); // set sender sp
332 BLOCK_COMMENT("call Java function");
333 __ call(c_rarg1);
334
335 BLOCK_COMMENT("call_stub_return_address:");
336 return_address = __ pc();
337
338 // store result depending on type (everything that is not
339 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
340 __ movptr(c_rarg0, result);
341 Label is_long, is_float, is_double, exit;
342 __ movl(c_rarg1, result_type);
343 __ cmpl(c_rarg1, T_OBJECT);
344 __ jcc(Assembler::equal, is_long);
345 __ cmpl(c_rarg1, T_LONG);
346 __ jcc(Assembler::equal, is_long);
347 __ cmpl(c_rarg1, T_FLOAT);
348 __ jcc(Assembler::equal, is_float);
349 __ cmpl(c_rarg1, T_DOUBLE);
350 __ jcc(Assembler::equal, is_double);
351
352 // handle T_INT case
353 __ movl(Address(c_rarg0, 0), rax);
354
355 __ BIND(exit);
356
357 // pop parameters
358 __ lea(rsp, rsp_after_call);
359
360#ifdef ASSERT
361 // verify that threads correspond
362 {
363 Label L1, L2, L3;
364 __ cmpptr(r15_thread, thread);
365 __ jcc(Assembler::equal, L1);
366 __ stop("StubRoutines::call_stub: r15_thread is corrupted");
367 __ bind(L1);
368 __ get_thread(rbx);
369 __ cmpptr(r15_thread, thread);
370 __ jcc(Assembler::equal, L2);
371 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
372 __ bind(L2);
373 __ cmpptr(r15_thread, rbx);
374 __ jcc(Assembler::equal, L3);
375 __ stop("StubRoutines::call_stub: threads must correspond");
376 __ bind(L3);
377 }
378#endif
379
380 // restore regs belonging to calling function
381#ifdef _WIN64
382 // emit the restores for xmm regs
383 if (VM_Version::supports_evex()) {
384 for (int i = xmm_save_first; i <= last_reg; i++) {
385 __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
386 }
387 } else {
388 for (int i = xmm_save_first; i <= last_reg; i++) {
389 __ movdqu(as_XMMRegister(i), xmm_save(i));
390 }
391 }
392#endif
393 __ movptr(r15, r15_save);
394 __ movptr(r14, r14_save);
395 __ movptr(r13, r13_save);
396 __ movptr(r12, r12_save);
397 __ movptr(rbx, rbx_save);
398
399#ifdef _WIN64
400 __ movptr(rdi, rdi_save);
401 __ movptr(rsi, rsi_save);
402#else
403 __ ldmxcsr(mxcsr_save);
404#endif
405
406 // restore rsp
407 __ addptr(rsp, -rsp_after_call_off * wordSize);
408
409 // return
410 __ vzeroupper();
411 __ pop(rbp);
412 __ ret(0);
413
414 // handle return types different from T_INT
415 __ BIND(is_long);
416 __ movq(Address(c_rarg0, 0), rax);
417 __ jmp(exit);
418
419 __ BIND(is_float);
420 __ movflt(Address(c_rarg0, 0), xmm0);
421 __ jmp(exit);
422
423 __ BIND(is_double);
424 __ movdbl(Address(c_rarg0, 0), xmm0);
425 __ jmp(exit);
426
427 return start;
428 }
429
430 // Return point for a Java call if there's an exception thrown in
431 // Java code. The exception is caught and transformed into a
432 // pending exception stored in JavaThread that can be tested from
433 // within the VM.
434 //
435 // Note: Usually the parameters are removed by the callee. In case
436 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
438 // rsp.
439 //
440 // rax: exception oop
441
442 address generate_catch_exception() {
443 StubCodeMark mark(this, "StubRoutines", "catch_exception");
444 address start = __ pc();
445
446 // same as in generate_call_stub():
447 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
448 const Address thread (rbp, thread_off * wordSize);
449
450#ifdef ASSERT
451 // verify that threads correspond
452 {
453 Label L1, L2, L3;
454 __ cmpptr(r15_thread, thread);
455 __ jcc(Assembler::equal, L1);
456 __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
457 __ bind(L1);
458 __ get_thread(rbx);
459 __ cmpptr(r15_thread, thread);
460 __ jcc(Assembler::equal, L2);
461 __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
462 __ bind(L2);
463 __ cmpptr(r15_thread, rbx);
464 __ jcc(Assembler::equal, L3);
465 __ stop("StubRoutines::catch_exception: threads must correspond");
466 __ bind(L3);
467 }
468#endif
469
470 // set pending exception
471 __ verify_oop(rax);
472
473 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
474 __ lea(rscratch1, ExternalAddress((address)__FILE__));
475 __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
476 __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);
477
478 // complete return to VM
479 assert(StubRoutines::_call_stub_return_address != NULL,
480 "_call_stub_return_address must have been generated before");
481 __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
482
483 return start;
484 }
485
486 // Continuation point for runtime calls returning with a pending
487 // exception. The pending exception check happened in the runtime
488 // or native call stub. The pending exception in Thread is
489 // converted into a Java-level exception.
490 //
491 // Contract with Java-level exception handlers:
492 // rax: exception
493 // rdx: throwing pc
494 //
495 // NOTE: At entry of this stub, exception-pc must be on stack !!
496
497 address generate_forward_exception() {
498 StubCodeMark mark(this, "StubRoutines", "forward exception");
499 address start = __ pc();
500
501 // Upon entry, the sp points to the return address returning into
502 // Java (interpreted or compiled) code; i.e., the return address
503 // becomes the throwing pc.
504 //
505 // Arguments pushed before the runtime call are still on the stack
506 // but the exception handler will reset the stack pointer ->
507 // ignore them. A potential result in registers can be ignored as
508 // well.
509
510#ifdef ASSERT
511 // make sure this code is only executed if there is a pending exception
512 {
513 Label L;
514 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
515 __ jcc(Assembler::notEqual, L);
516 __ stop("StubRoutines::forward exception: no pending exception (1)");
517 __ bind(L);
518 }
519#endif
520
521 // compute exception handler into rbx
522 __ movptr(c_rarg0, Address(rsp, 0));
523 BLOCK_COMMENT("call exception_handler_for_return_address");
524 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
525 SharedRuntime::exception_handler_for_return_address),
526 r15_thread, c_rarg0);
527 __ mov(rbx, rax);
528
529 // setup rax & rdx, remove return address & clear pending exception
530 __ pop(rdx);
531 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
532 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
533
534#ifdef ASSERT
535 // make sure exception is set
536 {
537 Label L;
538 __ testptr(rax, rax);
539 __ jcc(Assembler::notEqual, L);
540 __ stop("StubRoutines::forward exception: no pending exception (2)");
541 __ bind(L);
542 }
543#endif
544
545 // continue at exception handler (return address removed)
546 // rax: exception
547 // rbx: exception handler
548 // rdx: throwing pc
549 __ verify_oop(rax);
550 __ jmp(rbx);
551
552 return start;
553 }
554
555 // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
556 //
557 // Arguments :
558 // c_rarg0: exchange_value
  //    c_rarg1: dest
560 //
561 // Result:
562 // *dest <- ex, return (orig *dest)
563 address generate_atomic_xchg() {
564 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
565 address start = __ pc();
566
    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
568 __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
569 __ ret(0);
570
571 return start;
572 }
573
574 // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
575 //
576 // Arguments :
577 // c_rarg0: exchange_value
578 // c_rarg1: dest
579 //
580 // Result:
581 // *dest <- ex, return (orig *dest)
582 address generate_atomic_xchg_long() {
583 StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
584 address start = __ pc();
585
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
587 __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
588 __ ret(0);
589
590 return start;
591 }
592
593 // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
594 // jint compare_value)
595 //
596 // Arguments :
597 // c_rarg0: exchange_value
598 // c_rarg1: dest
599 // c_rarg2: compare_value
600 //
601 // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
607 address generate_atomic_cmpxchg() {
608 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
609 address start = __ pc();
610
611 __ movl(rax, c_rarg2);
612 __ lock();
613 __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
614 __ ret(0);
615
616 return start;
617 }
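
  // The stub above relies on the x86 CMPXCHG convention: the expected value
  // travels in rax and, whether or not the exchange happens, rax holds the
  // value found at the destination afterwards. As an illustrative C-level
  // sketch of what the locked sequence implements:
  //
  //   jint atomic_cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value) {
  //     jint old = *dest;                          // read and test under LOCK
  //     if (old == compare_value) *dest = exchange_value;
  //     return old;                                // returned in rax
  //   }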
618
619 // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
620 // int8_t compare_value)
621 //
622 // Arguments :
623 // c_rarg0: exchange_value
624 // c_rarg1: dest
625 // c_rarg2: compare_value
626 //
627 // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
633 address generate_atomic_cmpxchg_byte() {
634 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
635 address start = __ pc();
636
637 __ movsbq(rax, c_rarg2);
638 __ lock();
639 __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
640 __ ret(0);
641
642 return start;
643 }
644
645 // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
646 // volatile int64_t* dest,
647 // int64_t compare_value)
648 // Arguments :
649 // c_rarg0: exchange_value
650 // c_rarg1: dest
651 // c_rarg2: compare_value
652 //
653 // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
659 address generate_atomic_cmpxchg_long() {
660 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
661 address start = __ pc();
662
663 __ movq(rax, c_rarg2);
664 __ lock();
665 __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
666 __ ret(0);
667
668 return start;
669 }
670
671 // Support for jint atomic::add(jint add_value, volatile jint* dest)
672 //
673 // Arguments :
674 // c_rarg0: add_value
675 // c_rarg1: dest
676 //
677 // Result:
678 // *dest += add_value
679 // return *dest;
680 address generate_atomic_add() {
681 StubCodeMark mark(this, "StubRoutines", "atomic_add");
682 address start = __ pc();
683
684 __ movl(rax, c_rarg0);
685 __ lock();
686 __ xaddl(Address(c_rarg1, 0), c_rarg0);
687 __ addl(rax, c_rarg0);
688 __ ret(0);
689
690 return start;
691 }
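
  // Note on the sequence above: LOCK XADD leaves the previous value of *dest
  // in c_rarg0, so adding the saved copy of add_value (in rax) reconstructs
  // the updated value, matching the "return *dest" contract. Illustrative
  // sketch:
  //
  //   jint atomic_add(jint add_value, volatile jint* dest) {
  //     jint old = *dest;              // xadd: old value comes back in the source reg
  //     *dest = old + add_value;       // performed atomically under LOCK
  //     return old + add_value;        // rax = add_value + old
  //   }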
692
693 // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
694 //
695 // Arguments :
696 // c_rarg0: add_value
697 // c_rarg1: dest
698 //
699 // Result:
700 // *dest += add_value
701 // return *dest;
702 address generate_atomic_add_long() {
703 StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
704 address start = __ pc();
705
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
707 __ lock();
708 __ xaddptr(Address(c_rarg1, 0), c_rarg0);
709 __ addptr(rax, c_rarg0);
710 __ ret(0);
711
712 return start;
713 }
714
  // Support for void OrderAccess::fence()
716 //
717 // Arguments :
718 //
719 // Result:
720 address generate_orderaccess_fence() {
721 StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
722 address start = __ pc();
723 __ membar(Assembler::StoreLoad);
724 __ ret(0);
725
726 return start;
727 }
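
  // On x86 only StoreLoad reordering is observable to software, so this is
  // the one barrier that needs a real instruction; membar(Assembler::StoreLoad)
  // is expected to materialize as a locked read-modify-write of a stack slot
  // (or an MFENCE), either of which acts as a full fence.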
728
729 // Support for intptr_t get_previous_fp()
730 //
731 // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
735 address generate_get_previous_fp() {
736 StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
737 const Address old_fp(rbp, 0);
738 const Address older_fp(rax, 0);
739 address start = __ pc();
740
741 __ enter();
742 __ movptr(rax, old_fp); // callers fp
743 __ movptr(rax, older_fp); // the frame for ps()
744 __ pop(rbp);
745 __ ret(0);
746
747 return start;
748 }
749
750 // Support for intptr_t get_previous_sp()
751 //
752 // This routine is used to find the previous stack pointer for the
753 // caller.
754 address generate_get_previous_sp() {
755 StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
756 address start = __ pc();
757
758 __ movptr(rax, rsp);
759 __ addptr(rax, 8); // return address is at the top of the stack.
760 __ ret(0);
761
762 return start;
763 }
764
765 //----------------------------------------------------------------------------------------------------
766 // Support for void verify_mxcsr()
767 //
768 // This routine is used with -Xcheck:jni to verify that native
769 // JNI code does not return to Java code without restoring the
770 // MXCSR register to our expected state.
771
772 address generate_verify_mxcsr() {
773 StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
774 address start = __ pc();
775
776 const Address mxcsr_save(rsp, 0);
777
778 if (CheckJNICalls) {
779 Label ok_ret;
780 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
781 __ push(rax);
782 __ subptr(rsp, wordSize); // allocate a temp location
783 __ stmxcsr(mxcsr_save);
784 __ movl(rax, mxcsr_save);
785 __ andl(rax, MXCSR_MASK); // Only check control and mask bits
786 __ cmp32(rax, mxcsr_std);
787 __ jcc(Assembler::equal, ok_ret);
788
789 __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
790
791 __ ldmxcsr(mxcsr_std);
792
793 __ bind(ok_ret);
794 __ addptr(rsp, wordSize);
795 __ pop(rax);
796 }
797
798 __ ret(0);
799
800 return start;
801 }
802
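  // The *_fixup stubs below correct float/double-to-integer conversions.
  // CVTTSS2SI/CVTTSD2SI produce the "integer indefinite" value (min_jint or
  // min_jlong) for NaN and out-of-range inputs; compiled code invokes these
  // stubs in that case so that Java semantics apply instead. Roughly (an
  // illustrative sketch, in terms of the raw bits of the input value):
  //
  //   if ((bits & 0x7fffffff) > 0x7f800000)  result = 0;        // NaN -> 0
  //   else result = (bits < 0) ? min_jint : max_jint;           // +/- overflow saturates
  //
  // The value to fix up is passed on the stack (see 'inout') and rewritten
  // in place.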
803 address generate_f2i_fixup() {
804 StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
805 Address inout(rsp, 5 * wordSize); // return address + 4 saves
806
807 address start = __ pc();
808
809 Label L;
810
811 __ push(rax);
812 __ push(c_rarg3);
813 __ push(c_rarg2);
814 __ push(c_rarg1);
815
816 __ movl(rax, 0x7f800000);
817 __ xorl(c_rarg3, c_rarg3);
818 __ movl(c_rarg2, inout);
819 __ movl(c_rarg1, c_rarg2);
820 __ andl(c_rarg1, 0x7fffffff);
821 __ cmpl(rax, c_rarg1); // NaN? -> 0
822 __ jcc(Assembler::negative, L);
823 __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
824 __ movl(c_rarg3, 0x80000000);
825 __ movl(rax, 0x7fffffff);
826 __ cmovl(Assembler::positive, c_rarg3, rax);
827
828 __ bind(L);
829 __ movptr(inout, c_rarg3);
830
831 __ pop(c_rarg1);
832 __ pop(c_rarg2);
833 __ pop(c_rarg3);
834 __ pop(rax);
835
836 __ ret(0);
837
838 return start;
839 }
840
841 address generate_f2l_fixup() {
842 StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
843 Address inout(rsp, 5 * wordSize); // return address + 4 saves
844 address start = __ pc();
845
846 Label L;
847
848 __ push(rax);
849 __ push(c_rarg3);
850 __ push(c_rarg2);
851 __ push(c_rarg1);
852
853 __ movl(rax, 0x7f800000);
854 __ xorl(c_rarg3, c_rarg3);
855 __ movl(c_rarg2, inout);
856 __ movl(c_rarg1, c_rarg2);
857 __ andl(c_rarg1, 0x7fffffff);
858 __ cmpl(rax, c_rarg1); // NaN? -> 0
859 __ jcc(Assembler::negative, L);
860 __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
861 __ mov64(c_rarg3, 0x8000000000000000);
862 __ mov64(rax, 0x7fffffffffffffff);
863 __ cmov(Assembler::positive, c_rarg3, rax);
864
865 __ bind(L);
866 __ movptr(inout, c_rarg3);
867
868 __ pop(c_rarg1);
869 __ pop(c_rarg2);
870 __ pop(c_rarg3);
871 __ pop(rax);
872
873 __ ret(0);
874
875 return start;
876 }
877
878 address generate_d2i_fixup() {
879 StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
880 Address inout(rsp, 6 * wordSize); // return address + 5 saves
881
882 address start = __ pc();
883
884 Label L;
885
886 __ push(rax);
887 __ push(c_rarg3);
888 __ push(c_rarg2);
889 __ push(c_rarg1);
890 __ push(c_rarg0);
891
892 __ movl(rax, 0x7ff00000);
893 __ movq(c_rarg2, inout);
894 __ movl(c_rarg3, c_rarg2);
895 __ mov(c_rarg1, c_rarg2);
896 __ mov(c_rarg0, c_rarg2);
897 __ negl(c_rarg3);
898 __ shrptr(c_rarg1, 0x20);
899 __ orl(c_rarg3, c_rarg2);
900 __ andl(c_rarg1, 0x7fffffff);
901 __ xorl(c_rarg2, c_rarg2);
902 __ shrl(c_rarg3, 0x1f);
903 __ orl(c_rarg1, c_rarg3);
904 __ cmpl(rax, c_rarg1);
905 __ jcc(Assembler::negative, L); // NaN -> 0
906 __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
907 __ movl(c_rarg2, 0x80000000);
908 __ movl(rax, 0x7fffffff);
909 __ cmov(Assembler::positive, c_rarg2, rax);
910
911 __ bind(L);
912 __ movptr(inout, c_rarg2);
913
914 __ pop(c_rarg0);
915 __ pop(c_rarg1);
916 __ pop(c_rarg2);
917 __ pop(c_rarg3);
918 __ pop(rax);
919
920 __ ret(0);
921
922 return start;
923 }
924
925 address generate_d2l_fixup() {
926 StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
927 Address inout(rsp, 6 * wordSize); // return address + 5 saves
928
929 address start = __ pc();
930
931 Label L;
932
933 __ push(rax);
934 __ push(c_rarg3);
935 __ push(c_rarg2);
936 __ push(c_rarg1);
937 __ push(c_rarg0);
938
939 __ movl(rax, 0x7ff00000);
940 __ movq(c_rarg2, inout);
941 __ movl(c_rarg3, c_rarg2);
942 __ mov(c_rarg1, c_rarg2);
943 __ mov(c_rarg0, c_rarg2);
944 __ negl(c_rarg3);
945 __ shrptr(c_rarg1, 0x20);
946 __ orl(c_rarg3, c_rarg2);
947 __ andl(c_rarg1, 0x7fffffff);
948 __ xorl(c_rarg2, c_rarg2);
949 __ shrl(c_rarg3, 0x1f);
950 __ orl(c_rarg1, c_rarg3);
951 __ cmpl(rax, c_rarg1);
952 __ jcc(Assembler::negative, L); // NaN -> 0
953 __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
954 __ mov64(c_rarg2, 0x8000000000000000);
955 __ mov64(rax, 0x7fffffffffffffff);
956 __ cmovq(Assembler::positive, c_rarg2, rax);
957
958 __ bind(L);
959 __ movq(inout, c_rarg2);
960
961 __ pop(c_rarg0);
962 __ pop(c_rarg1);
963 __ pop(c_rarg2);
964 __ pop(c_rarg3);
965 __ pop(rax);
966
967 __ ret(0);
968
969 return start;
970 }
971
972 address generate_fp_mask(const char *stub_name, int64_t mask) {
973 __ align(CodeEntryAlignment);
974 StubCodeMark mark(this, "StubRoutines", stub_name);
975 address start = __ pc();
976
977 __ emit_data64( mask, relocInfo::none );
978 __ emit_data64( mask, relocInfo::none );
979
980 return start;
981 }
982
983 address generate_vector_mask(const char *stub_name, int64_t mask) {
984 __ align(CodeEntryAlignment);
985 StubCodeMark mark(this, "StubRoutines", stub_name);
986 address start = __ pc();
987
988 __ emit_data64(mask, relocInfo::none);
989 __ emit_data64(mask, relocInfo::none);
990 __ emit_data64(mask, relocInfo::none);
991 __ emit_data64(mask, relocInfo::none);
992 __ emit_data64(mask, relocInfo::none);
993 __ emit_data64(mask, relocInfo::none);
994 __ emit_data64(mask, relocInfo::none);
995 __ emit_data64(mask, relocInfo::none);
996
997 return start;
998 }
999
1000 address generate_vector_byte_perm_mask(const char *stub_name) {
1001 __ align(CodeEntryAlignment);
1002 StubCodeMark mark(this, "StubRoutines", stub_name);
1003 address start = __ pc();
1004
1005 __ emit_data64(0x0000000000000001, relocInfo::none);
1006 __ emit_data64(0x0000000000000003, relocInfo::none);
1007 __ emit_data64(0x0000000000000005, relocInfo::none);
1008 __ emit_data64(0x0000000000000007, relocInfo::none);
1009 __ emit_data64(0x0000000000000000, relocInfo::none);
1010 __ emit_data64(0x0000000000000002, relocInfo::none);
1011 __ emit_data64(0x0000000000000004, relocInfo::none);
1012 __ emit_data64(0x0000000000000006, relocInfo::none);
1013
1014 return start;
1015 }
1016
1017 // Non-destructive plausibility checks for oops
1018 //
1019 // Arguments:
1020 // all args on stack!
1021 //
1022 // Stack after saving c_rarg3:
1023 // [tos + 0]: saved c_rarg3
1024 // [tos + 1]: saved c_rarg2
1025 // [tos + 2]: saved r12 (several TemplateTable methods use it)
1026 // [tos + 3]: saved flags
1027 // [tos + 4]: return address
1028 // * [tos + 5]: error message (char*)
1029 // * [tos + 6]: object to verify (oop)
1030 // * [tos + 7]: saved rax - saved by caller and bashed
1031 // * [tos + 8]: saved r10 (rscratch1) - saved by caller
1032 // * = popped on exit
1033 address generate_verify_oop() {
1034 StubCodeMark mark(this, "StubRoutines", "verify_oop");
1035 address start = __ pc();
1036
1037 Label exit, error;
1038
1039 __ pushf();
1040 __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1041
1042 __ push(r12);
1043
1044 // save c_rarg2 and c_rarg3
1045 __ push(c_rarg2);
1046 __ push(c_rarg3);
1047
1048 enum {
1049 // After previous pushes.
1050 oop_to_verify = 6 * wordSize,
1051 saved_rax = 7 * wordSize,
1052 saved_r10 = 8 * wordSize,
1053
1054 // Before the call to MacroAssembler::debug(), see below.
1055 return_addr = 16 * wordSize,
1056 error_msg = 17 * wordSize
1057 };
1058
1059 // get object
1060 __ movptr(rax, Address(rsp, oop_to_verify));
1061
1062 // make sure object is 'reasonable'
1063 __ testptr(rax, rax);
1064 __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1065
1066#if INCLUDE_ZGC
1067 if (UseZGC) {
1068 // Check if metadata bits indicate a bad oop
1069 __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
1070 __ jcc(Assembler::notZero, error);
1071 }
1072#endif
1073
1074 // Check if the oop is in the right area of memory
1075 __ movptr(c_rarg2, rax);
1076 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1077 __ andptr(c_rarg2, c_rarg3);
1078 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1079 __ cmpptr(c_rarg2, c_rarg3);
1080 __ jcc(Assembler::notZero, error);
1081
1082 // set r12 to heapbase for load_klass()
1083 __ reinit_heapbase();
1084
    // make sure klass is 'reasonable', i.e. not zero.
1086 __ load_klass(rax, rax); // get klass
1087 __ testptr(rax, rax);
1088 __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1089
1090 // return if everything seems ok
1091 __ bind(exit);
1092 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
1093 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1094 __ pop(c_rarg3); // restore c_rarg3
1095 __ pop(c_rarg2); // restore c_rarg2
1096 __ pop(r12); // restore r12
1097 __ popf(); // restore flags
1098 __ ret(4 * wordSize); // pop caller saved stuff
1099
1100 // handle errors
1101 __ bind(error);
1102 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
1103 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1104 __ pop(c_rarg3); // get saved c_rarg3 back
1105 __ pop(c_rarg2); // get saved c_rarg2 back
1106 __ pop(r12); // get saved r12 back
1107 __ popf(); // get saved flags off stack --
1108 // will be ignored
1109
    __ pusha();                // push registers (rip is already pushed)
1113 // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1115 // pushed all the registers, so now the stack looks like:
1116 // [tos + 0] 16 saved registers
1117 // [tos + 16] return address
1118 // * [tos + 17] error message (char*)
1119 // * [tos + 18] object to verify (oop)
1120 // * [tos + 19] saved rax - saved by caller and bashed
1121 // * [tos + 20] saved r10 (rscratch1) - saved by caller
1122 // * = popped on exit
1123
1124 __ movptr(c_rarg0, Address(rsp, error_msg)); // pass address of error message
1125 __ movptr(c_rarg1, Address(rsp, return_addr)); // pass return address
1126 __ movq(c_rarg2, rsp); // pass address of regs on stack
1127 __ mov(r12, rsp); // remember rsp
1128 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1129 __ andptr(rsp, -16); // align stack as required by ABI
1130 BLOCK_COMMENT("call MacroAssembler::debug");
1131 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1132 __ mov(rsp, r12); // restore rsp
1133 __ popa(); // pop registers (includes r12)
1134 __ ret(4 * wordSize); // pop caller saved stuff
1135
1136 return start;
1137 }
1138
1139 //
  //  Verify that a register contains a clean 32-bit positive value
  //  (high 32 bits are 0) so it can be used in 64-bit shifts.
1142 //
1143 // Input:
1144 // Rint - 32-bits value
1145 // Rtmp - scratch
1146 //
1147 void assert_clean_int(Register Rint, Register Rtmp) {
1148#ifdef ASSERT
1149 Label L;
1150 assert_different_registers(Rtmp, Rint);
1151 __ movslq(Rtmp, Rint);
1152 __ cmpq(Rtmp, Rint);
1153 __ jcc(Assembler::equal, L);
1154 __ stop("high 32-bits of int value are not 0");
1155 __ bind(L);
1156#endif
1157 }
1158
1159 // Generate overlap test for array copy stubs
1160 //
1161 // Input:
1162 // c_rarg0 - from
1163 // c_rarg1 - to
1164 // c_rarg2 - element count
1165 //
1166 // Output:
  //     rax   - &from[element count] (exclusive end of the source array)
1168 //
1169 void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1170 assert(no_overlap_target != NULL, "must be generated");
1171 array_overlap_test(no_overlap_target, NULL, sf);
1172 }
1173 void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1174 array_overlap_test(NULL, &L_no_overlap, sf);
1175 }
1176 void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1177 const Register from = c_rarg0;
1178 const Register to = c_rarg1;
1179 const Register count = c_rarg2;
1180 const Register end_from = rax;
1181
1182 __ cmpptr(to, from);
1183 __ lea(end_from, Address(from, count, sf, 0));
1184 if (NOLp == NULL) {
1185 ExternalAddress no_overlap(no_overlap_target);
1186 __ jump_cc(Assembler::belowEqual, no_overlap);
1187 __ cmpptr(to, end_from);
1188 __ jump_cc(Assembler::aboveEqual, no_overlap);
1189 } else {
1190 __ jcc(Assembler::belowEqual, (*NOLp));
1191 __ cmpptr(to, end_from);
1192 __ jcc(Assembler::aboveEqual, (*NOLp));
1193 }
1194 }
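
  // Example: for a byte copy with from = 0x1000, count = 8 and to = 0x1004,
  // 'to' lies inside [from, from + count), so neither branch is taken and the
  // caller falls through to the backward (conjoint) copy; if to <= from or
  // to >= from + count we dispatch to the forward (disjoint) copy instead.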
1195
1196 // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1197 //
1198 // Outputs:
1199 // rdi - rcx
1200 // rsi - rdx
1201 // rdx - r8
1202 // rcx - r9
1203 //
  // On Windows, registers r9 and r10 are used to save rdi and rsi, which are
  // non-volatile there. r9 and r10 should not be used by the caller.
1206 //
1207 DEBUG_ONLY(bool regs_in_thread;)
1208
1209 void setup_arg_regs(int nargs = 3) {
1210 const Register saved_rdi = r9;
1211 const Register saved_rsi = r10;
1212 assert(nargs == 3 || nargs == 4, "else fix");
1213#ifdef _WIN64
1214 assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1215 "unexpected argument registers");
1216 if (nargs >= 4)
1217 __ mov(rax, r9); // r9 is also saved_rdi
1218 __ movptr(saved_rdi, rdi);
1219 __ movptr(saved_rsi, rsi);
1220 __ mov(rdi, rcx); // c_rarg0
1221 __ mov(rsi, rdx); // c_rarg1
1222 __ mov(rdx, r8); // c_rarg2
1223 if (nargs >= 4)
1224 __ mov(rcx, rax); // c_rarg3 (via rax)
1225#else
1226 assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1227 "unexpected argument registers");
1228#endif
1229 DEBUG_ONLY(regs_in_thread = false;)
1230 }
1231
1232 void restore_arg_regs() {
1233 assert(!regs_in_thread, "wrong call to restore_arg_regs");
1234 const Register saved_rdi = r9;
1235 const Register saved_rsi = r10;
1236#ifdef _WIN64
1237 __ movptr(rdi, saved_rdi);
1238 __ movptr(rsi, saved_rsi);
1239#endif
1240 }
1241
1242 // This is used in places where r10 is a scratch register, and can
1243 // be adapted if r9 is needed also.
1244 void setup_arg_regs_using_thread() {
1245 const Register saved_r15 = r9;
1246#ifdef _WIN64
1247 __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
1248 __ get_thread(r15_thread);
1249 assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1250 "unexpected argument registers");
1251 __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1252 __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1253
1254 __ mov(rdi, rcx); // c_rarg0
1255 __ mov(rsi, rdx); // c_rarg1
1256 __ mov(rdx, r8); // c_rarg2
1257#else
1258 assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1259 "unexpected argument registers");
1260#endif
1261 DEBUG_ONLY(regs_in_thread = true;)
1262 }
1263
1264 void restore_arg_regs_using_thread() {
1265 assert(regs_in_thread, "wrong call to restore_arg_regs");
1266 const Register saved_r15 = r9;
1267#ifdef _WIN64
1268 __ get_thread(r15_thread);
1269 __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1270 __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1271 __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored
1272#endif
1273 }
1274
1275 // Copy big chunks forward
1276 //
1277 // Inputs:
  //   end_from    - source array's end address
  //   end_to      - destination array's end address
  //   qword_count - 64-bit element count, negative
1281 // to - scratch
1282 // L_copy_bytes - entry label
1283 // L_copy_8_bytes - exit label
1284 //
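  // The forward loop indexes both arrays relative to their last qword: the
  // callers pass end_from/end_to pointing at the last qword of each array and
  // a negated qword_count, so Address(end_from, qword_count, times_8, disp)
  // walks toward the arrays' ends as qword_count is incremented to zero.
  // E.g. for a total of 8 qwords the entry add brings qword_count to 0 and
  // the 64-byte block at end_from - 56 covers exactly elements 0..7.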
1285 void copy_bytes_forward(Register end_from, Register end_to,
1286 Register qword_count, Register to,
1287 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1288 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1289 Label L_loop;
1290 __ align(OptoLoopAlignment);
1291 if (UseUnalignedLoadStores) {
1292 Label L_end;
1293 // Copy 64-bytes per iteration
1294 __ BIND(L_loop);
1295 if (UseAVX > 2) {
1296 __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1297 __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1298 } else if (UseAVX == 2) {
1299 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1300 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1301 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1302 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1303 } else {
1304 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1305 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1306 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1307 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1308 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1309 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1310 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1311 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1312 }
1313 __ BIND(L_copy_bytes);
1314 __ addptr(qword_count, 8);
1315 __ jcc(Assembler::lessEqual, L_loop);
1316 __ subptr(qword_count, 4); // sub(8) and add(4)
1317 __ jccb(Assembler::greater, L_end);
1318 // Copy trailing 32 bytes
1319 if (UseAVX >= 2) {
1320 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1321 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1322 } else {
1323 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1324 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1325 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1326 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1327 }
1328 __ addptr(qword_count, 4);
1329 __ BIND(L_end);
1330 if (UseAVX >= 2) {
1331 // clean upper bits of YMM registers
1332 __ vpxor(xmm0, xmm0);
1333 __ vpxor(xmm1, xmm1);
1334 }
1335 } else {
1336 // Copy 32-bytes per iteration
1337 __ BIND(L_loop);
1338 __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1339 __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1340 __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1341 __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1342 __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1343 __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1344 __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1345 __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1346
1347 __ BIND(L_copy_bytes);
1348 __ addptr(qword_count, 4);
1349 __ jcc(Assembler::lessEqual, L_loop);
1350 }
1351 __ subptr(qword_count, 4);
1352 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1353 }
1354
1355 // Copy big chunks backward
1356 //
1357 // Inputs:
  //   from        - source array's address
  //   dest        - destination array's address
  //   qword_count - 64-bit element count
1361 // to - scratch
1362 // L_copy_bytes - entry label
1363 // L_copy_8_bytes - exit label
1364 //
1365 void copy_bytes_backward(Register from, Register dest,
1366 Register qword_count, Register to,
1367 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1368 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1369 Label L_loop;
1370 __ align(OptoLoopAlignment);
1371 if (UseUnalignedLoadStores) {
1372 Label L_end;
1373 // Copy 64-bytes per iteration
1374 __ BIND(L_loop);
1375 if (UseAVX > 2) {
1376 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1377 __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1378 } else if (UseAVX == 2) {
1379 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1380 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1381 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1382 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1383 } else {
1384 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1385 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1386 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1387 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1388 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1389 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1390 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
1391 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
1392 }
1393 __ BIND(L_copy_bytes);
1394 __ subptr(qword_count, 8);
1395 __ jcc(Assembler::greaterEqual, L_loop);
1396
1397 __ addptr(qword_count, 4); // add(8) and sub(4)
1398 __ jccb(Assembler::less, L_end);
1399 // Copy trailing 32 bytes
1400 if (UseAVX >= 2) {
1401 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1402 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1403 } else {
1404 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1405 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1406 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1407 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1408 }
1409 __ subptr(qword_count, 4);
1410 __ BIND(L_end);
1411 if (UseAVX >= 2) {
1412 // clean upper bits of YMM registers
1413 __ vpxor(xmm0, xmm0);
1414 __ vpxor(xmm1, xmm1);
1415 }
1416 } else {
1417 // Copy 32-bytes per iteration
1418 __ BIND(L_loop);
1419 __ movq(to, Address(from, qword_count, Address::times_8, 24));
1420 __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1421 __ movq(to, Address(from, qword_count, Address::times_8, 16));
1422 __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1423 __ movq(to, Address(from, qword_count, Address::times_8, 8));
1424 __ movq(Address(dest, qword_count, Address::times_8, 8), to);
1425 __ movq(to, Address(from, qword_count, Address::times_8, 0));
1426 __ movq(Address(dest, qword_count, Address::times_8, 0), to);
1427
1428 __ BIND(L_copy_bytes);
1429 __ subptr(qword_count, 4);
1430 __ jcc(Assembler::greaterEqual, L_loop);
1431 }
1432 __ addptr(qword_count, 4);
1433 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1434 }
1435
1436
1437 // Arguments:
1438 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1439 // ignored
1440 // name - stub name string
1441 //
1442 // Inputs:
1443 // c_rarg0 - source array address
1444 // c_rarg1 - destination array address
1445 // c_rarg2 - element count, treated as ssize_t, can be zero
1446 //
1447 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1448 // we let the hardware handle it. The one to eight bytes within words,
1449 // dwords or qwords that span cache line boundaries will still be loaded
1450 // and stored atomically.
1451 //
1452 // Side Effects:
1453 // disjoint_byte_copy_entry is set to the no-overlap entry point
1454 // used by generate_conjoint_byte_copy().
1455 //
1456 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1457 __ align(CodeEntryAlignment);
1458 StubCodeMark mark(this, "StubRoutines", name);
1459 address start = __ pc();
1460
1461 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1462 Label L_copy_byte, L_exit;
1463 const Register from = rdi; // source array address
1464 const Register to = rsi; // destination array address
1465 const Register count = rdx; // elements count
1466 const Register byte_count = rcx;
1467 const Register qword_count = count;
1468 const Register end_from = from; // source array end address
1469 const Register end_to = to; // destination array end address
1470 // End pointers are inclusive, and if count is not zero they point
1471 // to the last unit copied: end_to[0] := end_from[0]
1472
1473 __ enter(); // required for proper stackwalking of RuntimeStub frame
1474 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1475
1476 if (entry != NULL) {
1477 *entry = __ pc();
1478 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1479 BLOCK_COMMENT("Entry:");
1480 }
1481
1482 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1483 // r9 and r10 may be used to save non-volatile registers
1484
1485 // 'from', 'to' and 'count' are now valid
1486 __ movptr(byte_count, count);
1487 __ shrptr(count, 3); // count => qword_count
1488
1489 // Copy from low to high addresses. Use 'to' as scratch.
1490 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1491 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1492 __ negptr(qword_count); // make the count negative
1493 __ jmp(L_copy_bytes);
1494
1495 // Copy trailing qwords
1496 __ BIND(L_copy_8_bytes);
1497 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1498 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1499 __ increment(qword_count);
1500 __ jcc(Assembler::notZero, L_copy_8_bytes);
1501
1502 // Check for and copy trailing dword
1503 __ BIND(L_copy_4_bytes);
1504 __ testl(byte_count, 4);
1505 __ jccb(Assembler::zero, L_copy_2_bytes);
1506 __ movl(rax, Address(end_from, 8));
1507 __ movl(Address(end_to, 8), rax);
1508
1509 __ addptr(end_from, 4);
1510 __ addptr(end_to, 4);
1511
1512 // Check for and copy trailing word
1513 __ BIND(L_copy_2_bytes);
1514 __ testl(byte_count, 2);
1515 __ jccb(Assembler::zero, L_copy_byte);
1516 __ movw(rax, Address(end_from, 8));
1517 __ movw(Address(end_to, 8), rax);
1518
1519 __ addptr(end_from, 2);
1520 __ addptr(end_to, 2);
1521
1522 // Check for and copy trailing byte
1523 __ BIND(L_copy_byte);
1524 __ testl(byte_count, 1);
1525 __ jccb(Assembler::zero, L_exit);
1526 __ movb(rax, Address(end_from, 8));
1527 __ movb(Address(end_to, 8), rax);
1528
1529 __ BIND(L_exit);
1530 restore_arg_regs();
1531 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1532 __ xorptr(rax, rax); // return 0
1533 __ vzeroupper();
1534 __ leave(); // required for proper stackwalking of RuntimeStub frame
1535 __ ret(0);
1536
    // Copy in multi-byte chunks
1538 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1539 __ jmp(L_copy_4_bytes);
1540
1541 return start;
1542 }
1543
1544 // Arguments:
1545 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546 // ignored
1547 // name - stub name string
1548 //
1549 // Inputs:
1550 // c_rarg0 - source array address
1551 // c_rarg1 - destination array address
1552 // c_rarg2 - element count, treated as ssize_t, can be zero
1553 //
1554 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1555 // we let the hardware handle it. The one to eight bytes within words,
1556 // dwords or qwords that span cache line boundaries will still be loaded
1557 // and stored atomically.
1558 //
1559 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1560 address* entry, const char *name) {
1561 __ align(CodeEntryAlignment);
1562 StubCodeMark mark(this, "StubRoutines", name);
1563 address start = __ pc();
1564
1565 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1566 const Register from = rdi; // source array address
1567 const Register to = rsi; // destination array address
1568 const Register count = rdx; // elements count
1569 const Register byte_count = rcx;
1570 const Register qword_count = count;
1571
1572 __ enter(); // required for proper stackwalking of RuntimeStub frame
1573 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1574
1575 if (entry != NULL) {
1576 *entry = __ pc();
1577 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1578 BLOCK_COMMENT("Entry:");
1579 }
1580
1581 array_overlap_test(nooverlap_target, Address::times_1);
1582 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1583 // r9 and r10 may be used to save non-volatile registers
1584
1585 // 'from', 'to' and 'count' are now valid
1586 __ movptr(byte_count, count);
1587 __ shrptr(count, 3); // count => qword_count
1588
1589 // Copy from high to low addresses.
1590
1591 // Check for and copy trailing byte
1592 __ testl(byte_count, 1);
1593 __ jcc(Assembler::zero, L_copy_2_bytes);
1594 __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1595 __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1596 __ decrement(byte_count); // Adjust for possible trailing word
1597
1598 // Check for and copy trailing word
1599 __ BIND(L_copy_2_bytes);
1600 __ testl(byte_count, 2);
1601 __ jcc(Assembler::zero, L_copy_4_bytes);
1602 __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1603 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1604
1605 // Check for and copy trailing dword
1606 __ BIND(L_copy_4_bytes);
1607 __ testl(byte_count, 4);
1608 __ jcc(Assembler::zero, L_copy_bytes);
1609 __ movl(rax, Address(from, qword_count, Address::times_8));
1610 __ movl(Address(to, qword_count, Address::times_8), rax);
1611 __ jmp(L_copy_bytes);
1612
1613 // Copy trailing qwords
1614 __ BIND(L_copy_8_bytes);
1615 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1616 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1617 __ decrement(qword_count);
1618 __ jcc(Assembler::notZero, L_copy_8_bytes);
1619
1620 restore_arg_regs();
1621 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1622 __ xorptr(rax, rax); // return 0
1623 __ vzeroupper();
1624 __ leave(); // required for proper stackwalking of RuntimeStub frame
1625 __ ret(0);
1626
    // Copy in multi-byte chunks
1628 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1629
1630 restore_arg_regs();
1631 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1632 __ xorptr(rax, rax); // return 0
1633 __ vzeroupper();
1634 __ leave(); // required for proper stackwalking of RuntimeStub frame
1635 __ ret(0);
1636
1637 return start;
1638 }
1639
1640 // Arguments:
1641 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1642 // ignored
1643 // name - stub name string
1644 //
1645 // Inputs:
1646 // c_rarg0 - source array address
1647 // c_rarg1 - destination array address
1648 // c_rarg2 - element count, treated as ssize_t, can be zero
1649 //
1650 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1651 // let the hardware handle it. The two or four words within dwords
1652 // or qwords that span cache line boundaries will still be loaded
1653 // and stored atomically.
1654 //
1655 // Side Effects:
1656 // disjoint_short_copy_entry is set to the no-overlap entry point
1657 // used by generate_conjoint_short_copy().
1658 //
1659 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1660 __ align(CodeEntryAlignment);
1661 StubCodeMark mark(this, "StubRoutines", name);
1662 address start = __ pc();
1663
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1665 const Register from = rdi; // source array address
1666 const Register to = rsi; // destination array address
1667 const Register count = rdx; // elements count
1668 const Register word_count = rcx;
1669 const Register qword_count = count;
1670 const Register end_from = from; // source array end address
1671 const Register end_to = to; // destination array end address
1672 // End pointers are inclusive, and if count is not zero they point
1673 // to the last unit copied: end_to[0] := end_from[0]
1674
1675 __ enter(); // required for proper stackwalking of RuntimeStub frame
1676 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1677
1678 if (entry != NULL) {
1679 *entry = __ pc();
1680 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1681 BLOCK_COMMENT("Entry:");
1682 }
1683
1684 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1685 // r9 and r10 may be used to save non-volatile registers
1686
1687 // 'from', 'to' and 'count' are now valid
1688 __ movptr(word_count, count);
1689 __ shrptr(count, 2); // count => qword_count
1690
1691 // Copy from low to high addresses. Use 'to' as scratch.
1692 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1693 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1694 __ negptr(qword_count);
1695 __ jmp(L_copy_bytes);
1696
1697 // Copy trailing qwords
1698 __ BIND(L_copy_8_bytes);
1699 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1700 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1701 __ increment(qword_count);
1702 __ jcc(Assembler::notZero, L_copy_8_bytes);
1703
1704 // Original 'dest' is trashed, so we can't use it as a
1705 // base register for a possible trailing word copy
1706
1707 // Check for and copy trailing dword
1708 __ BIND(L_copy_4_bytes);
1709 __ testl(word_count, 2);
1710 __ jccb(Assembler::zero, L_copy_2_bytes);
1711 __ movl(rax, Address(end_from, 8));
1712 __ movl(Address(end_to, 8), rax);
1713
1714 __ addptr(end_from, 4);
1715 __ addptr(end_to, 4);
1716
1717 // Check for and copy trailing word
1718 __ BIND(L_copy_2_bytes);
1719 __ testl(word_count, 1);
1720 __ jccb(Assembler::zero, L_exit);
1721 __ movw(rax, Address(end_from, 8));
1722 __ movw(Address(end_to, 8), rax);
1723
1724 __ BIND(L_exit);
1725 restore_arg_regs();
1726 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1727 __ xorptr(rax, rax); // return 0
1728 __ vzeroupper();
1729 __ leave(); // required for proper stackwalking of RuntimeStub frame
1730 __ ret(0);
1731
1732     // Copy in multi-byte chunks
1733 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1734 __ jmp(L_copy_4_bytes);
1735
1736 return start;
1737 }
1738
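  // Fill stub.
  //
  // Inputs:
  //   c_rarg0   - destination array address
  //   c_rarg1   - fill value
  //   c_rarg2   - element count
  //
  // Conceptually to[i] = value for i in [0, count); the actual fill loop is
  // emitted by MacroAssembler::generate_fill().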
1739 address generate_fill(BasicType t, bool aligned, const char *name) {
1740 __ align(CodeEntryAlignment);
1741 StubCodeMark mark(this, "StubRoutines", name);
1742 address start = __ pc();
1743
1744 BLOCK_COMMENT("Entry:");
1745
1746     const Register to        = c_rarg0;  // destination array address
1747 const Register value = c_rarg1; // value
1748 const Register count = c_rarg2; // elements count
1749
1750 __ enter(); // required for proper stackwalking of RuntimeStub frame
1751
1752 __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1753
1754 __ vzeroupper();
1755 __ leave(); // required for proper stackwalking of RuntimeStub frame
1756 __ ret(0);
1757 return start;
1758 }
1759
1760 // Arguments:
1761 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1762 // ignored
1763 // name - stub name string
1764 //
1765 // Inputs:
1766 // c_rarg0 - source array address
1767 // c_rarg1 - destination array address
1768 // c_rarg2 - element count, treated as ssize_t, can be zero
1769 //
1770 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1771 // let the hardware handle it. The two or four words within dwords
1772 // or qwords that span cache line boundaries will still be loaded
1773 // and stored atomically.
1774 //
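  // Unlike the disjoint stub, this one must tolerate overlapping arrays: it
  // first branches to nooverlap_target when the ranges do not overlap, and
  // otherwise copies from high addresses down to low (tail word and dword
  // first, then the 8-byte main loop) so no source element is overwritten
  // before it is read.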
1775 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1776 address *entry, const char *name) {
1777 __ align(CodeEntryAlignment);
1778 StubCodeMark mark(this, "StubRoutines", name);
1779 address start = __ pc();
1780
1781 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1782 const Register from = rdi; // source array address
1783 const Register to = rsi; // destination array address
1784 const Register count = rdx; // elements count
1785 const Register word_count = rcx;
1786 const Register qword_count = count;
1787
1788 __ enter(); // required for proper stackwalking of RuntimeStub frame
1789 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1790
1791 if (entry != NULL) {
1792 *entry = __ pc();
1793 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1794 BLOCK_COMMENT("Entry:");
1795 }
1796
1797 array_overlap_test(nooverlap_target, Address::times_2);
1798 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1799 // r9 and r10 may be used to save non-volatile registers
1800
1801 // 'from', 'to' and 'count' are now valid
1802 __ movptr(word_count, count);
1803 __ shrptr(count, 2); // count => qword_count
1804
1805 // Copy from high to low addresses. Use 'to' as scratch.
1806
1807 // Check for and copy trailing word
1808 __ testl(word_count, 1);
1809 __ jccb(Assembler::zero, L_copy_4_bytes);
1810 __ movw(rax, Address(from, word_count, Address::times_2, -2));
1811 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1812
1813 // Check for and copy trailing dword
1814 __ BIND(L_copy_4_bytes);
1815 __ testl(word_count, 2);
1816 __ jcc(Assembler::zero, L_copy_bytes);
1817 __ movl(rax, Address(from, qword_count, Address::times_8));
1818 __ movl(Address(to, qword_count, Address::times_8), rax);
1819 __ jmp(L_copy_bytes);
1820
1821 // Copy trailing qwords
1822 __ BIND(L_copy_8_bytes);
1823 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1824 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1825 __ decrement(qword_count);
1826 __ jcc(Assembler::notZero, L_copy_8_bytes);
1827
1828 restore_arg_regs();
1829 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1830 __ xorptr(rax, rax); // return 0
1831 __ vzeroupper();
1832 __ leave(); // required for proper stackwalking of RuntimeStub frame
1833 __ ret(0);
1834
1835     // Copy in multi-byte chunks
1836 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1837
1838 restore_arg_regs();
1839 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1840 __ xorptr(rax, rax); // return 0
1841 __ vzeroupper();
1842 __ leave(); // required for proper stackwalking of RuntimeStub frame
1843 __ ret(0);
1844
1845 return start;
1846 }
1847
1848 // Arguments:
1849 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1850 // ignored
1851 // is_oop - true => oop array, so generate store check code
1852 // name - stub name string
1853 //
1854 // Inputs:
1855 // c_rarg0 - source array address
1856 // c_rarg1 - destination array address
1857 // c_rarg2 - element count, treated as ssize_t, can be zero
1858 //
1859 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1860 // the hardware handle it. The two dwords within qwords that span
1861   // cache line boundaries will still be loaded and stored atomically.
1862 //
1863 // Side Effects:
1864 // disjoint_int_copy_entry is set to the no-overlap entry point
1865 // used by generate_conjoint_int_oop_copy().
1866 //
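  // When is_oop is true, the required GC barriers are emitted around the copy
  // via BarrierSetAssembler::arraycopy_prologue()/arraycopy_epilogue(); the
  // copy loop itself is shared between the jint and compressed-oop cases.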
1867 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1868 const char *name, bool dest_uninitialized = false) {
1869 __ align(CodeEntryAlignment);
1870 StubCodeMark mark(this, "StubRoutines", name);
1871 address start = __ pc();
1872
1873 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1874 const Register from = rdi; // source array address
1875 const Register to = rsi; // destination array address
1876 const Register count = rdx; // elements count
1877 const Register dword_count = rcx;
1878 const Register qword_count = count;
1879 const Register end_from = from; // source array end address
1880 const Register end_to = to; // destination array end address
1881 // End pointers are inclusive, and if count is not zero they point
1882 // to the last unit copied: end_to[0] := end_from[0]
1883
1884 __ enter(); // required for proper stackwalking of RuntimeStub frame
1885 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1886
1887 if (entry != NULL) {
1888 *entry = __ pc();
1889 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1890 BLOCK_COMMENT("Entry:");
1891 }
1892
1893 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1894 // r9 is used to save r15_thread
1895
1896 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1897 if (dest_uninitialized) {
1898 decorators |= IS_DEST_UNINITIALIZED;
1899 }
1900 if (aligned) {
1901 decorators |= ARRAYCOPY_ALIGNED;
1902 }
1903
1904 BasicType type = is_oop ? T_OBJECT : T_INT;
1905 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1906 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1907
1908 // 'from', 'to' and 'count' are now valid
1909 __ movptr(dword_count, count);
1910 __ shrptr(count, 1); // count => qword_count
1911
1912 // Copy from low to high addresses. Use 'to' as scratch.
1913 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1914 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1915 __ negptr(qword_count);
1916 __ jmp(L_copy_bytes);
1917
1918 // Copy trailing qwords
1919 __ BIND(L_copy_8_bytes);
1920 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1921 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1922 __ increment(qword_count);
1923 __ jcc(Assembler::notZero, L_copy_8_bytes);
1924
1925 // Check for and copy trailing dword
1926 __ BIND(L_copy_4_bytes);
1927 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1928 __ jccb(Assembler::zero, L_exit);
1929 __ movl(rax, Address(end_from, 8));
1930 __ movl(Address(end_to, 8), rax);
1931
1932 __ BIND(L_exit);
1933 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1934 restore_arg_regs_using_thread();
1935 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1936 __ vzeroupper();
1937 __ xorptr(rax, rax); // return 0
1938 __ leave(); // required for proper stackwalking of RuntimeStub frame
1939 __ ret(0);
1940
1941     // Copy in multi-byte chunks
1942 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1943 __ jmp(L_copy_4_bytes);
1944
1945 return start;
1946 }
1947
1948 // Arguments:
1949 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1950 // ignored
1951 // is_oop - true => oop array, so generate store check code
1952 // name - stub name string
1953 //
1954 // Inputs:
1955 // c_rarg0 - source array address
1956 // c_rarg1 - destination array address
1957 // c_rarg2 - element count, treated as ssize_t, can be zero
1958 //
1959 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1960 // the hardware handle it. The two dwords within qwords that span
1961   // cache line boundaries will still be loaded and stored atomically.
1962 //
1963 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1964 address *entry, const char *name,
1965 bool dest_uninitialized = false) {
1966 __ align(CodeEntryAlignment);
1967 StubCodeMark mark(this, "StubRoutines", name);
1968 address start = __ pc();
1969
1970 Label L_copy_bytes, L_copy_8_bytes, L_exit;
1971 const Register from = rdi; // source array address
1972 const Register to = rsi; // destination array address
1973 const Register count = rdx; // elements count
1974 const Register dword_count = rcx;
1975 const Register qword_count = count;
1976
1977 __ enter(); // required for proper stackwalking of RuntimeStub frame
1978 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1979
1980 if (entry != NULL) {
1981 *entry = __ pc();
1982 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1983 BLOCK_COMMENT("Entry:");
1984 }
1985
1986 array_overlap_test(nooverlap_target, Address::times_4);
1987 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1988 // r9 is used to save r15_thread
1989
1990 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1991 if (dest_uninitialized) {
1992 decorators |= IS_DEST_UNINITIALIZED;
1993 }
1994 if (aligned) {
1995 decorators |= ARRAYCOPY_ALIGNED;
1996 }
1997
1998 BasicType type = is_oop ? T_OBJECT : T_INT;
1999 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2000 // no registers are destroyed by this call
2001 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2002
2003 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2004 // 'from', 'to' and 'count' are now valid
2005 __ movptr(dword_count, count);
2006 __ shrptr(count, 1); // count => qword_count
2007
2008 // Copy from high to low addresses. Use 'to' as scratch.
2009
2010 // Check for and copy trailing dword
2011 __ testl(dword_count, 1);
2012 __ jcc(Assembler::zero, L_copy_bytes);
2013 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2014 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2015 __ jmp(L_copy_bytes);
2016
2017 // Copy trailing qwords
2018 __ BIND(L_copy_8_bytes);
2019 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2020 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2021 __ decrement(qword_count);
2022 __ jcc(Assembler::notZero, L_copy_8_bytes);
2023
2024 if (is_oop) {
2025 __ jmp(L_exit);
2026 }
2027 restore_arg_regs_using_thread();
2028 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2029 __ xorptr(rax, rax); // return 0
2030 __ vzeroupper();
2031 __ leave(); // required for proper stackwalking of RuntimeStub frame
2032 __ ret(0);
2033
2034     // Copy in multi-byte chunks
2035 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2036
2037 __ BIND(L_exit);
2038 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2039 restore_arg_regs_using_thread();
2040 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2041 __ xorptr(rax, rax); // return 0
2042 __ vzeroupper();
2043 __ leave(); // required for proper stackwalking of RuntimeStub frame
2044 __ ret(0);
2045
2046 return start;
2047 }
2048
2049 // Arguments:
2050 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2051 // ignored
2052 // is_oop - true => oop array, so generate store check code
2053 // name - stub name string
2054 //
2055 // Inputs:
2056 // c_rarg0 - source array address
2057 // c_rarg1 - destination array address
2058 // c_rarg2 - element count, treated as ssize_t, can be zero
2059 //
2060 // Side Effects:
2061 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2062 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2063 //
2064 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2065 const char *name, bool dest_uninitialized = false) {
2066 __ align(CodeEntryAlignment);
2067 StubCodeMark mark(this, "StubRoutines", name);
2068 address start = __ pc();
2069
2070 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2071 const Register from = rdi; // source array address
2072 const Register to = rsi; // destination array address
2073 const Register qword_count = rdx; // elements count
2074 const Register end_from = from; // source array end address
2075 const Register end_to = rcx; // destination array end address
2076 const Register saved_count = r11;
2077 // End pointers are inclusive, and if count is not zero they point
2078 // to the last unit copied: end_to[0] := end_from[0]
2079
2080 __ enter(); // required for proper stackwalking of RuntimeStub frame
2081 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2082 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2083
2084 if (entry != NULL) {
2085 *entry = __ pc();
2086 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2087 BLOCK_COMMENT("Entry:");
2088 }
2089
2090 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2091 // r9 is used to save r15_thread
2092 // 'from', 'to' and 'qword_count' are now valid
2093
2094 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2095 if (dest_uninitialized) {
2096 decorators |= IS_DEST_UNINITIALIZED;
2097 }
2098 if (aligned) {
2099 decorators |= ARRAYCOPY_ALIGNED;
2100 }
2101
2102 BasicType type = is_oop ? T_OBJECT : T_LONG;
2103 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2104 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2105
2106 // Copy from low to high addresses. Use 'to' as scratch.
2107 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2108 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2109 __ negptr(qword_count);
2110 __ jmp(L_copy_bytes);
2111
2112 // Copy trailing qwords
2113 __ BIND(L_copy_8_bytes);
2114 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2115 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2116 __ increment(qword_count);
2117 __ jcc(Assembler::notZero, L_copy_8_bytes);
2118
2119 if (is_oop) {
2120 __ jmp(L_exit);
2121 } else {
2122 restore_arg_regs_using_thread();
2123 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2124 __ xorptr(rax, rax); // return 0
2125 __ vzeroupper();
2126 __ leave(); // required for proper stackwalking of RuntimeStub frame
2127 __ ret(0);
2128 }
2129
2130     // Copy in multi-byte chunks
2131 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2132
2133 __ BIND(L_exit);
2134 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2135 restore_arg_regs_using_thread();
2136 if (is_oop) {
2137 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2138 } else {
2139 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2140 }
2141 __ vzeroupper();
2142 __ xorptr(rax, rax); // return 0
2143 __ leave(); // required for proper stackwalking of RuntimeStub frame
2144 __ ret(0);
2145
2146 return start;
2147 }
2148
2149 // Arguments:
2150 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2151 // ignored
2152 // is_oop - true => oop array, so generate store check code
2153 // name - stub name string
2154 //
2155 // Inputs:
2156 // c_rarg0 - source array address
2157 // c_rarg1 - destination array address
2158 // c_rarg2 - element count, treated as ssize_t, can be zero
2159 //
2160 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2161 address nooverlap_target, address *entry,
2162 const char *name, bool dest_uninitialized = false) {
2163 __ align(CodeEntryAlignment);
2164 StubCodeMark mark(this, "StubRoutines", name);
2165 address start = __ pc();
2166
2167 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2168 const Register from = rdi; // source array address
2169 const Register to = rsi; // destination array address
2170 const Register qword_count = rdx; // elements count
2171 const Register saved_count = rcx;
2172
2173 __ enter(); // required for proper stackwalking of RuntimeStub frame
2174 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2175
2176 if (entry != NULL) {
2177 *entry = __ pc();
2178 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2179 BLOCK_COMMENT("Entry:");
2180 }
2181
2182 array_overlap_test(nooverlap_target, Address::times_8);
2183 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2184 // r9 is used to save r15_thread
2185 // 'from', 'to' and 'qword_count' are now valid
2186
2187 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2188 if (dest_uninitialized) {
2189 decorators |= IS_DEST_UNINITIALIZED;
2190 }
2191 if (aligned) {
2192 decorators |= ARRAYCOPY_ALIGNED;
2193 }
2194
2195 BasicType type = is_oop ? T_OBJECT : T_LONG;
2196 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2197 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2198
2199 __ jmp(L_copy_bytes);
2200
2201 // Copy trailing qwords
2202 __ BIND(L_copy_8_bytes);
2203 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2204 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2205 __ decrement(qword_count);
2206 __ jcc(Assembler::notZero, L_copy_8_bytes);
2207
2208 if (is_oop) {
2209 __ jmp(L_exit);
2210 } else {
2211 restore_arg_regs_using_thread();
2212 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2213 __ xorptr(rax, rax); // return 0
2214 __ vzeroupper();
2215 __ leave(); // required for proper stackwalking of RuntimeStub frame
2216 __ ret(0);
2217 }
2218
2219     // Copy in multi-byte chunks
2220 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2221
2222 __ BIND(L_exit);
2223 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2224 restore_arg_regs_using_thread();
2225 if (is_oop) {
2226 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2227 } else {
2228 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2229 }
2230 __ vzeroupper();
2231 __ xorptr(rax, rax); // return 0
2232 __ leave(); // required for proper stackwalking of RuntimeStub frame
2233 __ ret(0);
2234
2235 return start;
2236 }
2237
2238
2239 // Helper for generating a dynamic type check.
2240 // Smashes no registers.
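  // Control flow: jumps to L_success if sub_klass is a subtype of super_klass
  // (fast path keyed by super_check_offset, then the slow path); on failure it
  // falls through past L_miss.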
2241 void generate_type_check(Register sub_klass,
2242 Register super_check_offset,
2243 Register super_klass,
2244 Label& L_success) {
2245 assert_different_registers(sub_klass, super_check_offset, super_klass);
2246
2247 BLOCK_COMMENT("type_check:");
2248
2249 Label L_miss;
2250
2251 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
2252 super_check_offset);
2253 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2254
2255 // Fall through on failure!
2256 __ BIND(L_miss);
2257 }
2258
2259 //
2260 // Generate checkcasting array copy stub
2261 //
2262 // Input:
2263 // c_rarg0 - source array address
2264 // c_rarg1 - destination array address
2265 // c_rarg2 - element count, treated as ssize_t, can be zero
2266 // c_rarg3 - size_t ckoff (super_check_offset)
2267 // not Win64
2268 // c_rarg4 - oop ckval (super_klass)
2269 // Win64
2270 // rsp+40 - oop ckval (super_klass)
2271 //
2272 // Output:
2273 // rax == 0 - success
2274 // rax == -1^K - failure, where K is partial transfer count
2275 //
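  // For example (illustrative): if 3 elements are stored before the failing
  // element, K == 3 and the stub returns rax == ~3 == -4.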
2276 address generate_checkcast_copy(const char *name, address *entry,
2277 bool dest_uninitialized = false) {
2278
2279 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2280
2281 // Input registers (after setup_arg_regs)
2282 const Register from = rdi; // source array address
2283 const Register to = rsi; // destination array address
2284 const Register length = rdx; // elements count
2285 const Register ckoff = rcx; // super_check_offset
2286 const Register ckval = r8; // super_klass
2287
2288 // Registers used as temps (r13, r14 are save-on-entry)
2289 const Register end_from = from; // source array end address
2290 const Register end_to = r13; // destination array end address
2291 const Register count = rdx; // -(count_remaining)
2292 const Register r14_length = r14; // saved copy of length
2293 // End pointers are inclusive, and if length is not zero they point
2294 // to the last unit copied: end_to[0] := end_from[0]
2295
2296 const Register rax_oop = rax; // actual oop copied
2297 const Register r11_klass = r11; // oop._klass
2298
2299 //---------------------------------------------------------------
2300 // Assembler stub will be used for this call to arraycopy
2301 // if the two arrays are subtypes of Object[] but the
2302 // destination array type is not equal to or a supertype
2303 // of the source type. Each element must be separately
2304 // checked.
2305
2306 __ align(CodeEntryAlignment);
2307 StubCodeMark mark(this, "StubRoutines", name);
2308 address start = __ pc();
2309
2310 __ enter(); // required for proper stackwalking of RuntimeStub frame
2311
2312#ifdef ASSERT
2313 // caller guarantees that the arrays really are different
2314 // otherwise, we would have to make conjoint checks
2315 { Label L;
2316 array_overlap_test(L, TIMES_OOP);
2317 __ stop("checkcast_copy within a single array");
2318 __ bind(L);
2319 }
2320#endif //ASSERT
2321
2322 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2323 // ckoff => rcx, ckval => r8
2324 // r9 and r10 may be used to save non-volatile registers
2325#ifdef _WIN64
2326 // last argument (#4) is on stack on Win64
2327 __ movptr(ckval, Address(rsp, 6 * wordSize));
2328#endif
2329
2330 // Caller of this entry point must set up the argument registers.
2331 if (entry != NULL) {
2332 *entry = __ pc();
2333 BLOCK_COMMENT("Entry:");
2334 }
2335
2336 // allocate spill slots for r13, r14
2337 enum {
2338 saved_r13_offset,
2339 saved_r14_offset,
2340 saved_r10_offset,
2341 saved_rbp_offset
2342 };
2343 __ subptr(rsp, saved_rbp_offset * wordSize);
2344 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2345 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2346 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2347
2348#ifdef ASSERT
2349 Label L2;
2350 __ get_thread(r14);
2351 __ cmpptr(r15_thread, r14);
2352 __ jcc(Assembler::equal, L2);
2353 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2354 __ bind(L2);
2355#endif // ASSERT
2356
2357 // check that int operands are properly extended to size_t
2358 assert_clean_int(length, rax);
2359 assert_clean_int(ckoff, rax);
2360
2361#ifdef ASSERT
2362 BLOCK_COMMENT("assert consistent ckoff/ckval");
2363 // The ckoff and ckval must be mutually consistent,
2364 // even though caller generates both.
2365 { Label L;
2366 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2367 __ cmpl(ckoff, Address(ckval, sco_offset));
2368 __ jcc(Assembler::equal, L);
2369 __ stop("super_check_offset inconsistent");
2370 __ bind(L);
2371 }
2372#endif //ASSERT
2373
2374 // Loop-invariant addresses. They are exclusive end pointers.
2375 Address end_from_addr(from, length, TIMES_OOP, 0);
2376 Address end_to_addr(to, length, TIMES_OOP, 0);
2377 // Loop-variant addresses. They assume post-incremented count < 0.
2378 Address from_element_addr(end_from, count, TIMES_OOP, 0);
2379 Address to_element_addr(end_to, count, TIMES_OOP, 0);
2380
2381 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2382 if (dest_uninitialized) {
2383 decorators |= IS_DEST_UNINITIALIZED;
2384 }
2385
2386 BasicType type = T_OBJECT;
2387 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2388 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2389
2390 // Copy from low to high addresses, indexed from the end of each array.
2391 __ lea(end_from, end_from_addr);
2392 __ lea(end_to, end_to_addr);
2393 __ movptr(r14_length, length); // save a copy of the length
2394 assert(length == count, ""); // else fix next line:
2395 __ negptr(count); // negate and test the length
2396 __ jcc(Assembler::notZero, L_load_element);
2397
2398 // Empty array: Nothing to do.
2399 __ xorptr(rax, rax); // return 0 on (trivial) success
2400 __ jmp(L_done);
2401
2402 // ======== begin loop ========
2403 // (Loop is rotated; its entry is L_load_element.)
2404 // Loop control:
2405 // for (count = -count; count != 0; count++)
2406     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2407 __ align(OptoLoopAlignment);
2408
2409 __ BIND(L_store_element);
2410 __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW); // store the oop
2411 __ increment(count); // increment the count toward zero
2412 __ jcc(Assembler::zero, L_do_card_marks);
2413
2414 // ======== loop entry is here ========
2415 __ BIND(L_load_element);
2416 __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2417 __ testptr(rax_oop, rax_oop);
2418 __ jcc(Assembler::zero, L_store_element);
2419
2420 __ load_klass(r11_klass, rax_oop);// query the object klass
2421 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2422 // ======== end loop ========
2423
2424 // It was a real error; we must depend on the caller to finish the job.
2425 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2426 // Emit GC store barriers for the oops we have copied (r14 + rdx),
2427 // and report their number to the caller.
2428 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2429 Label L_post_barrier;
2430 __ addptr(r14_length, count); // K = (original - remaining) oops
2431 __ movptr(rax, r14_length); // save the value
2432 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
2433 __ jccb(Assembler::notZero, L_post_barrier);
2434 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2435
2436 // Come here on success only.
2437 __ BIND(L_do_card_marks);
2438 __ xorptr(rax, rax); // return 0 on success
2439
2440 __ BIND(L_post_barrier);
2441 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2442
2443 // Common exit point (success or failure).
2444 __ BIND(L_done);
2445 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2446 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2447 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2448 restore_arg_regs();
2449 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2450 __ leave(); // required for proper stackwalking of RuntimeStub frame
2451 __ ret(0);
2452
2453 return start;
2454 }
2455
2456 //
2457 // Generate 'unsafe' array copy stub
2458 // Though just as safe as the other stubs, it takes an unscaled
2459 // size_t argument instead of an element count.
2460 //
2461 // Input:
2462 // c_rarg0 - source array address
2463 // c_rarg1 - destination array address
2464 // c_rarg2 - byte count, treated as ssize_t, can be zero
2465 //
2466 // Examines the alignment of the operands and dispatches
2467 // to a long, int, short, or byte copy loop.
2468 //
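  // Dispatch sketch (conceptual C, not the emitted code):
  //   bits = from | to | size;
  //   if ((bits & 7) == 0) goto long_copy;    // count = size >> 3
  //   if ((bits & 3) == 0) goto int_copy;     // count = size >> 2
  //   if ((bits & 1) == 0) goto short_copy;   // count = size >> 1
  //   goto byte_copy;                         // count = size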
2469 address generate_unsafe_copy(const char *name,
2470 address byte_copy_entry, address short_copy_entry,
2471 address int_copy_entry, address long_copy_entry) {
2472
2473 Label L_long_aligned, L_int_aligned, L_short_aligned;
2474
2475 // Input registers (before setup_arg_regs)
2476 const Register from = c_rarg0; // source array address
2477 const Register to = c_rarg1; // destination array address
2478 const Register size = c_rarg2; // byte count (size_t)
2479
2480 // Register used as a temp
2481 const Register bits = rax; // test copy of low bits
2482
2483 __ align(CodeEntryAlignment);
2484 StubCodeMark mark(this, "StubRoutines", name);
2485 address start = __ pc();
2486
2487 __ enter(); // required for proper stackwalking of RuntimeStub frame
2488
2489 // bump this on entry, not on exit:
2490 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2491
2492 __ mov(bits, from);
2493 __ orptr(bits, to);
2494 __ orptr(bits, size);
2495
2496 __ testb(bits, BytesPerLong-1);
2497 __ jccb(Assembler::zero, L_long_aligned);
2498
2499 __ testb(bits, BytesPerInt-1);
2500 __ jccb(Assembler::zero, L_int_aligned);
2501
2502 __ testb(bits, BytesPerShort-1);
2503 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2504
2505 __ BIND(L_short_aligned);
2506 __ shrptr(size, LogBytesPerShort); // size => short_count
2507 __ jump(RuntimeAddress(short_copy_entry));
2508
2509 __ BIND(L_int_aligned);
2510 __ shrptr(size, LogBytesPerInt); // size => int_count
2511 __ jump(RuntimeAddress(int_copy_entry));
2512
2513 __ BIND(L_long_aligned);
2514 __ shrptr(size, LogBytesPerLong); // size => qword_count
2515 __ jump(RuntimeAddress(long_copy_entry));
2516
2517 return start;
2518 }
2519
2520 // Perform range checks on the proposed arraycopy.
2521 // Kills temp, but nothing else.
2522 // Also, clean the sign bits of src_pos and dst_pos.
2523 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2524 Register src_pos, // source position (c_rarg1)
2525                                 Register dst,     // destination array oop (c_rarg2)
2526 Register dst_pos, // destination position (c_rarg3)
2527 Register length,
2528 Register temp,
2529 Label& L_failed) {
2530 BLOCK_COMMENT("arraycopy_range_checks:");
2531
2532 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2533 __ movl(temp, length);
2534 __ addl(temp, src_pos); // src_pos + length
2535 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2536 __ jcc(Assembler::above, L_failed);
2537
2538 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2539 __ movl(temp, length);
2540 __ addl(temp, dst_pos); // dst_pos + length
2541 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2542 __ jcc(Assembler::above, L_failed);
2543
2544 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2545 // Move with sign extension can be used since they are positive.
2546 __ movslq(src_pos, src_pos);
2547 __ movslq(dst_pos, dst_pos);
2548
2549 BLOCK_COMMENT("arraycopy_range_checks done");
2550 }
2551
2552 //
2553 // Generate generic array copy stubs
2554 //
2555 // Input:
2556 // c_rarg0 - src oop
2557 // c_rarg1 - src_pos (32-bits)
2558 // c_rarg2 - dst oop
2559 // c_rarg3 - dst_pos (32-bits)
2560 // not Win64
2561 // c_rarg4 - element count (32-bits)
2562 // Win64
2563 // rsp+40 - element count (32-bits)
2564 //
2565 // Output:
2566 // rax == 0 - success
2567 // rax == -1^K - failure, where K is partial transfer count
2568 //
2569 address generate_generic_copy(const char *name,
2570 address byte_copy_entry, address short_copy_entry,
2571 address int_copy_entry, address oop_copy_entry,
2572 address long_copy_entry, address checkcast_copy_entry) {
2573
2574 Label L_failed, L_failed_0, L_objArray;
2575 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2576
2577 // Input registers
2578 const Register src = c_rarg0; // source array oop
2579 const Register src_pos = c_rarg1; // source position
2580 const Register dst = c_rarg2; // destination array oop
2581 const Register dst_pos = c_rarg3; // destination position
2582#ifndef _WIN64
2583 const Register length = c_rarg4;
2584#else
2585 const Address length(rsp, 6 * wordSize); // elements count is on stack on Win64
2586#endif
2587
2588 { int modulus = CodeEntryAlignment;
2589 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
2590 int advance = target - (__ offset() % modulus);
2591 if (advance < 0) advance += modulus;
2592 if (advance > 0) __ nop(advance);
2593 }
2594 StubCodeMark mark(this, "StubRoutines", name);
2595
2596 // Short-hop target to L_failed. Makes for denser prologue code.
2597 __ BIND(L_failed_0);
2598 __ jmp(L_failed);
2599 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2600
2601 __ align(CodeEntryAlignment);
2602 address start = __ pc();
2603
2604 __ enter(); // required for proper stackwalking of RuntimeStub frame
2605
2606 // bump this on entry, not on exit:
2607 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2608
2609 //-----------------------------------------------------------------------
2610 // Assembler stub will be used for this call to arraycopy
2611 // if the following conditions are met:
2612 //
2613 // (1) src and dst must not be null.
2614 // (2) src_pos must not be negative.
2615 // (3) dst_pos must not be negative.
2616 // (4) length must not be negative.
2617 // (5) src klass and dst klass should be the same and not NULL.
2618 // (6) src and dst should be arrays.
2619 // (7) src_pos + length must not exceed length of src.
2620 // (8) dst_pos + length must not exceed length of dst.
2621 //
2622
2623 // if (src == NULL) return -1;
2624 __ testptr(src, src); // src oop
2625 size_t j1off = __ offset();
2626 __ jccb(Assembler::zero, L_failed_0);
2627
2628 // if (src_pos < 0) return -1;
2629 __ testl(src_pos, src_pos); // src_pos (32-bits)
2630 __ jccb(Assembler::negative, L_failed_0);
2631
2632 // if (dst == NULL) return -1;
2633 __ testptr(dst, dst); // dst oop
2634 __ jccb(Assembler::zero, L_failed_0);
2635
2636 // if (dst_pos < 0) return -1;
2637 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2638 size_t j4off = __ offset();
2639 __ jccb(Assembler::negative, L_failed_0);
2640
2641 // The first four tests are very dense code,
2642 // but not quite dense enough to put four
2643 // jumps in a 16-byte instruction fetch buffer.
2644     // That's good, because some branch predictors
2645 // do not like jumps so close together.
2646 // Make sure of this.
2647 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2648
2649 // registers used as temp
2650 const Register r11_length = r11; // elements count to copy
2651 const Register r10_src_klass = r10; // array klass
2652
2653 // if (length < 0) return -1;
2654 __ movl(r11_length, length); // length (elements count, 32-bits value)
2655 __ testl(r11_length, r11_length);
2656 __ jccb(Assembler::negative, L_failed_0);
2657
2658 __ load_klass(r10_src_klass, src);
2659#ifdef ASSERT
2660 // assert(src->klass() != NULL);
2661 {
2662 BLOCK_COMMENT("assert klasses not null {");
2663 Label L1, L2;
2664 __ testptr(r10_src_klass, r10_src_klass);
2665 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
2666 __ bind(L1);
2667 __ stop("broken null klass");
2668 __ bind(L2);
2669 __ load_klass(rax, dst);
2670 __ cmpq(rax, 0);
2671 __ jcc(Assembler::equal, L1); // this would be broken also
2672 BLOCK_COMMENT("} assert klasses not null done");
2673 }
2674#endif
2675
2676 // Load layout helper (32-bits)
2677 //
2678 // |array_tag| | header_size | element_type | |log2_element_size|
2679 // 32 30 24 16 8 2 0
2680 //
2681 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2682 //
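    // For example, a jint[] klass has the typeArray tag with
    // log2_element_size == 2 (4-byte elements); Object[] klasses carry the
    // objArray tag and are diverted to L_objArray below.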
2683
2684 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2685
2686 // Handle objArrays completely differently...
2687 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2688 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2689 __ jcc(Assembler::equal, L_objArray);
2690
2691 // if (src->klass() != dst->klass()) return -1;
2692 __ load_klass(rax, dst);
2693 __ cmpq(r10_src_klass, rax);
2694 __ jcc(Assembler::notEqual, L_failed);
2695
2696 const Register rax_lh = rax; // layout helper
2697 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2698
2699 // if (!src->is_Array()) return -1;
2700 __ cmpl(rax_lh, Klass::_lh_neutral_value);
2701 __ jcc(Assembler::greaterEqual, L_failed);
2702
2703 // At this point, it is known to be a typeArray (array_tag 0x3).
2704#ifdef ASSERT
2705 {
2706 BLOCK_COMMENT("assert primitive array {");
2707 Label L;
2708 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2709 __ jcc(Assembler::greaterEqual, L);
2710 __ stop("must be a primitive array");
2711 __ bind(L);
2712 BLOCK_COMMENT("} assert primitive array done");
2713 }
2714#endif
2715
2716 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2717 r10, L_failed);
2718
2719 // TypeArrayKlass
2720 //
2721 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2722 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2723 //
2724
2725 const Register r10_offset = r10; // array offset
2726 const Register rax_elsize = rax_lh; // element size
2727
2728 __ movl(r10_offset, rax_lh);
2729 __ shrl(r10_offset, Klass::_lh_header_size_shift);
2730 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
2731 __ addptr(src, r10_offset); // src array offset
2732 __ addptr(dst, r10_offset); // dst array offset
2733 BLOCK_COMMENT("choose copy loop based on element size");
2734 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2735
2736 // next registers should be set before the jump to corresponding stub
2737 const Register from = c_rarg0; // source array address
2738 const Register to = c_rarg1; // destination array address
2739 const Register count = c_rarg2; // elements count
2740
2741     // 'from', 'to' and 'count' must be set in exactly this order: they alias
2742     // 'src', 'src_pos' and 'dst', so each value is consumed before its register is overwritten.
2743
2744 __ BIND(L_copy_bytes);
2745 __ cmpl(rax_elsize, 0);
2746 __ jccb(Assembler::notEqual, L_copy_shorts);
2747 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2748 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2749 __ movl2ptr(count, r11_length); // length
2750 __ jump(RuntimeAddress(byte_copy_entry));
2751
2752 __ BIND(L_copy_shorts);
2753 __ cmpl(rax_elsize, LogBytesPerShort);
2754 __ jccb(Assembler::notEqual, L_copy_ints);
2755 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2756 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2757 __ movl2ptr(count, r11_length); // length
2758 __ jump(RuntimeAddress(short_copy_entry));
2759
2760 __ BIND(L_copy_ints);
2761 __ cmpl(rax_elsize, LogBytesPerInt);
2762 __ jccb(Assembler::notEqual, L_copy_longs);
2763 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2764 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2765 __ movl2ptr(count, r11_length); // length
2766 __ jump(RuntimeAddress(int_copy_entry));
2767
2768 __ BIND(L_copy_longs);
2769#ifdef ASSERT
2770 {
2771 BLOCK_COMMENT("assert long copy {");
2772 Label L;
2773 __ cmpl(rax_elsize, LogBytesPerLong);
2774 __ jcc(Assembler::equal, L);
2775 __ stop("must be long copy, but elsize is wrong");
2776 __ bind(L);
2777 BLOCK_COMMENT("} assert long copy done");
2778 }
2779#endif
2780 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2781 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2782 __ movl2ptr(count, r11_length); // length
2783 __ jump(RuntimeAddress(long_copy_entry));
2784
2785 // ObjArrayKlass
2786 __ BIND(L_objArray);
2787 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
2788
2789 Label L_plain_copy, L_checkcast_copy;
2790 // test array classes for subtyping
2791 __ load_klass(rax, dst);
2792 __ cmpq(r10_src_klass, rax); // usual case is exact equality
2793 __ jcc(Assembler::notEqual, L_checkcast_copy);
2794
2795 // Identically typed arrays can be copied without element-wise checks.
2796 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2797 r10, L_failed);
2798
2799 __ lea(from, Address(src, src_pos, TIMES_OOP,
2800 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2801 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2802 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2803 __ movl2ptr(count, r11_length); // length
2804 __ BIND(L_plain_copy);
2805 __ jump(RuntimeAddress(oop_copy_entry));
2806
2807 __ BIND(L_checkcast_copy);
2808 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
2809 {
2810 // Before looking at dst.length, make sure dst is also an objArray.
2811 __ cmpl(Address(rax, lh_offset), objArray_lh);
2812 __ jcc(Assembler::notEqual, L_failed);
2813
2814 // It is safe to examine both src.length and dst.length.
2815 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2816 rax, L_failed);
2817
2818 const Register r11_dst_klass = r11;
2819 __ load_klass(r11_dst_klass, dst); // reload
2820
2821 // Marshal the base address arguments now, freeing registers.
2822 __ lea(from, Address(src, src_pos, TIMES_OOP,
2823 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2824 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2825 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2826 __ movl(count, length); // length (reloaded)
2827 Register sco_temp = c_rarg3; // this register is free now
2828 assert_different_registers(from, to, count, sco_temp,
2829 r11_dst_klass, r10_src_klass);
2830 assert_clean_int(count, sco_temp);
2831
2832 // Generate the type check.
2833 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2834 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2835 assert_clean_int(sco_temp, rax);
2836 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2837
2838 // Fetch destination element klass from the ObjArrayKlass header.
2839 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2840 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2841 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
2842 assert_clean_int(sco_temp, rax);
2843
2844 // the checkcast_copy loop needs two extra arguments:
2845 assert(c_rarg3 == sco_temp, "#3 already in place");
2846 // Set up arguments for checkcast_copy_entry.
2847 setup_arg_regs(4);
2848 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2849 __ jump(RuntimeAddress(checkcast_copy_entry));
2850 }
2851
2852 __ BIND(L_failed);
2853 __ xorptr(rax, rax);
2854 __ notptr(rax); // return -1
2855 __ leave(); // required for proper stackwalking of RuntimeStub frame
2856 __ ret(0);
2857
2858 return start;
2859 }
2860
2861 void generate_arraycopy_stubs() {
2862 address entry;
2863 address entry_jbyte_arraycopy;
2864 address entry_jshort_arraycopy;
2865 address entry_jint_arraycopy;
2866 address entry_oop_arraycopy;
2867 address entry_jlong_arraycopy;
2868 address entry_checkcast_arraycopy;
2869
2870 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2871 "jbyte_disjoint_arraycopy");
2872 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2873 "jbyte_arraycopy");
2874
2875 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2876 "jshort_disjoint_arraycopy");
2877 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2878 "jshort_arraycopy");
2879
2880 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry,
2881 "jint_disjoint_arraycopy");
2882 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry,
2883 &entry_jint_arraycopy, "jint_arraycopy");
2884
2885 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry,
2886 "jlong_disjoint_arraycopy");
2887 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry,
2888 &entry_jlong_arraycopy, "jlong_arraycopy");
2889
2890
2891 if (UseCompressedOops) {
2892 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry,
2893 "oop_disjoint_arraycopy");
2894 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry,
2895 &entry_oop_arraycopy, "oop_arraycopy");
2896 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
2897 "oop_disjoint_arraycopy_uninit",
2898 /*dest_uninitialized*/true);
2899 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry,
2900 NULL, "oop_arraycopy_uninit",
2901 /*dest_uninitialized*/true);
2902 } else {
2903 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry,
2904 "oop_disjoint_arraycopy");
2905 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry,
2906 &entry_oop_arraycopy, "oop_arraycopy");
2907 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
2908 "oop_disjoint_arraycopy_uninit",
2909 /*dest_uninitialized*/true);
2910 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry,
2911 NULL, "oop_arraycopy_uninit",
2912 /*dest_uninitialized*/true);
2913 }
2914
2915 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2916 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2917 /*dest_uninitialized*/true);
2918
2919 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
2920 entry_jbyte_arraycopy,
2921 entry_jshort_arraycopy,
2922 entry_jint_arraycopy,
2923 entry_jlong_arraycopy);
2924 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
2925 entry_jbyte_arraycopy,
2926 entry_jshort_arraycopy,
2927 entry_jint_arraycopy,
2928 entry_oop_arraycopy,
2929 entry_jlong_arraycopy,
2930 entry_checkcast_arraycopy);
2931
2932 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2933 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2934 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2935 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2936 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2937 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2938
2939 // We don't generate specialized code for HeapWord-aligned source
2940 // arrays, so just use the code we've already generated
2941 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
2942 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
2943
2944 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2945 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
2946
2947 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
2948 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
2949
2950 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
2951 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
2952
2953 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
2954 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
2955
2956 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2957 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2958 }
2959
2960 // AES intrinsic stubs
2961 enum {AESBlockSize = 16};
2962
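  // The key shuffle mask below, used with pshufb, byte-reverses each 32-bit
  // word of a loaded key block (mask bytes 03 02 01 00 | 07 06 05 04 | ...);
  // load_key() applies it so the int-ordered Java key schedule ends up in the
  // byte order the AES instructions operate on.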
2963 address generate_key_shuffle_mask() {
2964 __ align(16);
2965 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2966 address start = __ pc();
2967 __ emit_data64( 0x0405060700010203, relocInfo::none );
2968 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2969 return start;
2970 }
2971
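  // The counter shuffle mask reverses all 16 bytes of an XMM register
  // (mask bytes 0f 0e ... 01 00); it is used in the CTR-mode stubs to flip the
  // counter's byte order around the increment.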
2972 address generate_counter_shuffle_mask() {
2973 __ align(16);
2974 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2975 address start = __ pc();
2976 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
2977 __ emit_data64(0x0001020304050607, relocInfo::none);
2978 return start;
2979 }
2980
2981   // Utility routine for loading a 128-bit key word in little-endian format.
2982   // The caller can optionally pass the shuffle mask already loaded in an XMM register.
2983 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2984 __ movdqu(xmmdst, Address(key, offset));
2985 if (xmm_shuf_mask != NULL) {
2986 __ pshufb(xmmdst, xmm_shuf_mask);
2987 } else {
2988 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2989 }
2990 }
2991
2992   // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
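  // Conceptually, with the counter held in xmmdst as two 64-bit halves:
  //   lo += inc_delta; if (the addition carried out of lo) hi += 1;
  // then control continues at next_block.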
2993 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2994 __ pextrq(reg, xmmdst, 0x0);
2995 __ addq(reg, inc_delta);
2996 __ pinsrq(xmmdst, reg, 0x0);
2997 __ jcc(Assembler::carryClear, next_block); // jump if no carry
2998 __ pextrq(reg, xmmdst, 0x01); // Carry
2999 __ addq(reg, 0x01);
3000 __ pinsrq(xmmdst, reg, 0x01); //Carry end
3001 __ BIND(next_block); // next instruction
3002 }
3003
3004 // Arguments:
3005 //
3006 // Inputs:
3007 // c_rarg0 - source byte array address
3008 // c_rarg1 - destination byte array address
3009 // c_rarg2 - K (key) in little endian int array
3010 //
3011 address generate_aescrypt_encryptBlock() {
3012 assert(UseAES, "need AES instructions and misaligned SSE support");
3013 __ align(CodeEntryAlignment);
3014 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3015 Label L_doLast;
3016 address start = __ pc();
3017
3018 const Register from = c_rarg0; // source array address
3019 const Register to = c_rarg1; // destination array address
3020 const Register key = c_rarg2; // key array address
3021 const Register keylen = rax;
3022
3023 const XMMRegister xmm_result = xmm0;
3024 const XMMRegister xmm_key_shuf_mask = xmm1;
3025 // On win64 xmm6-xmm15 must be preserved so don't use them.
3026 const XMMRegister xmm_temp1 = xmm2;
3027 const XMMRegister xmm_temp2 = xmm3;
3028 const XMMRegister xmm_temp3 = xmm4;
3029 const XMMRegister xmm_temp4 = xmm5;
3030
3031 __ enter(); // required for proper stackwalking of RuntimeStub frame
3032
3033 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
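    // (44, 52 and 60 ints correspond to AES-128, AES-192 and AES-256, i.e.
    // 10, 12 and 14 rounds; the keylen comparisons below select the extra
    // rounds for the longer keys.)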
3034 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3035
3036 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3037 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
3038
3039     // For encryption, the Java-expanded key ordering is just what we need.
3040     // We don't know if the key is aligned, hence not using the load-execute form.
3041
3042 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3043 __ pxor(xmm_result, xmm_temp1);
3044
3045 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3046 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3047 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3048 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3049
3050 __ aesenc(xmm_result, xmm_temp1);
3051 __ aesenc(xmm_result, xmm_temp2);
3052 __ aesenc(xmm_result, xmm_temp3);
3053 __ aesenc(xmm_result, xmm_temp4);
3054
3055 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3056 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3057 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3058 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3059
3060 __ aesenc(xmm_result, xmm_temp1);
3061 __ aesenc(xmm_result, xmm_temp2);
3062 __ aesenc(xmm_result, xmm_temp3);
3063 __ aesenc(xmm_result, xmm_temp4);
3064
3065 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3066 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3067
3068 __ cmpl(keylen, 44);
3069 __ jccb(Assembler::equal, L_doLast);
3070
3071 __ aesenc(xmm_result, xmm_temp1);
3072 __ aesenc(xmm_result, xmm_temp2);
3073
3074 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3075 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3076
3077 __ cmpl(keylen, 52);
3078 __ jccb(Assembler::equal, L_doLast);
3079
3080 __ aesenc(xmm_result, xmm_temp1);
3081 __ aesenc(xmm_result, xmm_temp2);
3082
3083 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3084 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3085
3086 __ BIND(L_doLast);
3087 __ aesenc(xmm_result, xmm_temp1);
3088 __ aesenclast(xmm_result, xmm_temp2);
3089 __ movdqu(Address(to, 0), xmm_result); // store the result
3090 __ xorptr(rax, rax); // return 0
3091 __ leave(); // required for proper stackwalking of RuntimeStub frame
3092 __ ret(0);
3093
3094 return start;
3095 }
3096
3097
3098 // Arguments:
3099 //
3100 // Inputs:
3101 // c_rarg0 - source byte array address
3102 // c_rarg1 - destination byte array address
3103 // c_rarg2 - K (key) in little endian int array
3104 //
3105 address generate_aescrypt_decryptBlock() {
3106 assert(UseAES, "need AES instructions and misaligned SSE support");
3107 __ align(CodeEntryAlignment);
3108 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3109 Label L_doLast;
3110 address start = __ pc();
3111
3112 const Register from = c_rarg0; // source array address
3113 const Register to = c_rarg1; // destination array address
3114 const Register key = c_rarg2; // key array address
3115 const Register keylen = rax;
3116
3117 const XMMRegister xmm_result = xmm0;
3118 const XMMRegister xmm_key_shuf_mask = xmm1;
3119 // On win64 xmm6-xmm15 must be preserved so don't use them.
3120 const XMMRegister xmm_temp1 = xmm2;
3121 const XMMRegister xmm_temp2 = xmm3;
3122 const XMMRegister xmm_temp3 = xmm4;
3123 const XMMRegister xmm_temp4 = xmm5;
3124
3125 __ enter(); // required for proper stackwalking of RuntimeStub frame
3126
3127 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3128 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3129
3130 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3131 __ movdqu(xmm_result, Address(from, 0));
3132
3133     // For decryption the Java-expanded key ordering is rotated one position from what we want,
3134     // so we start from 0x10 here and hit 0x00 last.
3135     // We don't know if the key is aligned, hence not using the load-execute form.
3136 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3137 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3138 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3139 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3140
3141 __ pxor (xmm_result, xmm_temp1);
3142 __ aesdec(xmm_result, xmm_temp2);
3143 __ aesdec(xmm_result, xmm_temp3);
3144 __ aesdec(xmm_result, xmm_temp4);
3145
3146 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3147 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3148 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3149 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3150
3151 __ aesdec(xmm_result, xmm_temp1);
3152 __ aesdec(xmm_result, xmm_temp2);
3153 __ aesdec(xmm_result, xmm_temp3);
3154 __ aesdec(xmm_result, xmm_temp4);
3155
3156 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3157 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3158 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3159
3160 __ cmpl(keylen, 44);
3161 __ jccb(Assembler::equal, L_doLast);
3162
3163 __ aesdec(xmm_result, xmm_temp1);
3164 __ aesdec(xmm_result, xmm_temp2);
3165
3166 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3167 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3168
3169 __ cmpl(keylen, 52);
3170 __ jccb(Assembler::equal, L_doLast);
3171
3172 __ aesdec(xmm_result, xmm_temp1);
3173 __ aesdec(xmm_result, xmm_temp2);
3174
3175 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3176 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3177
3178 __ BIND(L_doLast);
3179 __ aesdec(xmm_result, xmm_temp1);
3180 __ aesdec(xmm_result, xmm_temp2);
3181
3182 // for decryption the aesdeclast operation is always on key+0x00
3183 __ aesdeclast(xmm_result, xmm_temp3);
3184 __ movdqu(Address(to, 0), xmm_result); // store the result
3185 __ xorptr(rax, rax); // return 0
3186 __ leave(); // required for proper stackwalking of RuntimeStub frame
3187 __ ret(0);
3188
3189 return start;
3190 }
3191
3192
3193 // Arguments:
3194 //
3195 // Inputs:
3196 // c_rarg0 - source byte array address
3197 // c_rarg1 - destination byte array address
3198 // c_rarg2 - K (key) in little endian int array
3199 // c_rarg3 - r vector byte array address
3200 // c_rarg4 - input length
3201 //
3202 // Output:
3203 // rax - input length
3204 //
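  // CBC encryption is inherently serial, each ciphertext block feeding the
  // next one:
  //   C[0] = E_K(P[0] ^ IV),   C[i] = E_K(P[i] ^ C[i-1])
  // so the loops below keep the running r vector in xmm_result and only
  // write it back to rvec on exit.
  //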
3205 address generate_cipherBlockChaining_encryptAESCrypt() {
3206 assert(UseAES, "need AES instructions and misaligned SSE support");
3207 __ align(CodeEntryAlignment);
3208 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3209 address start = __ pc();
3210
3211 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3212 const Register from = c_rarg0; // source array address
3213 const Register to = c_rarg1; // destination array address
3214 const Register key = c_rarg2; // key array address
3215 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3216 // and left with the results of the last encryption block
3217#ifndef _WIN64
3218 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3219#else
3220 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3221 const Register len_reg = r11; // pick the volatile windows register
3222#endif
3223 const Register pos = rax;
3224
3225 // xmm register assignments for the loops below
3226 const XMMRegister xmm_result = xmm0;
3227 const XMMRegister xmm_temp = xmm1;
3228 // keys 0-10 preloaded into xmm2-xmm12
3229 const int XMM_REG_NUM_KEY_FIRST = 2;
3230 const int XMM_REG_NUM_KEY_LAST = 15;
3231 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3232 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3233 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3234 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3235 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3236
3237 __ enter(); // required for proper stackwalking of RuntimeStub frame
3238
3239#ifdef _WIN64
3240 // on win64, fill len_reg from stack position
3241 __ movl(len_reg, len_mem);
3242#else
3243 __ push(len_reg); // Save
3244#endif
3245
3246 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
3247 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3248 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3249 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3250 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3251 offset += 0x10;
3252 }
3253 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
3254
3255 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3256 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3257 __ cmpl(rax, 44);
3258 __ jcc(Assembler::notEqual, L_key_192_256);
3259
3260 // 128 bit code follows here
3261 __ movptr(pos, 0);
3262 __ align(OptoLoopAlignment);
3263
3264 __ BIND(L_loopTop_128);
3265 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3266 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3267 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3268 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3269 __ aesenc(xmm_result, as_XMMRegister(rnum));
3270 }
3271 __ aesenclast(xmm_result, xmm_key10);
3272 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3273 // no need to store r to memory until we exit
3274 __ addptr(pos, AESBlockSize);
3275 __ subptr(len_reg, AESBlockSize);
3276 __ jcc(Assembler::notEqual, L_loopTop_128);
3277
3278 __ BIND(L_exit);
3279 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
3280
3281#ifdef _WIN64
3282 __ movl(rax, len_mem);
3283#else
3284 __ pop(rax); // return length
3285#endif
3286 __ leave(); // required for proper stackwalking of RuntimeStub frame
3287 __ ret(0);
3288
3289 __ BIND(L_key_192_256);
3290 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3291 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3292 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3293 __ cmpl(rax, 52);
3294 __ jcc(Assembler::notEqual, L_key_256);
3295
3296 // 192-bit code follows here (could be changed to use more xmm registers)
3297 __ movptr(pos, 0);
3298 __ align(OptoLoopAlignment);
3299
3300 __ BIND(L_loopTop_192);
3301 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3302 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3303 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3304 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3305 __ aesenc(xmm_result, as_XMMRegister(rnum));
3306 }
3307 __ aesenclast(xmm_result, xmm_key12);
3308 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3309 // no need to store r to memory until we exit
3310 __ addptr(pos, AESBlockSize);
3311 __ subptr(len_reg, AESBlockSize);
3312 __ jcc(Assembler::notEqual, L_loopTop_192);
3313 __ jmp(L_exit);
3314
3315 __ BIND(L_key_256);
3316 // 256-bit code follows here (could be changed to use more xmm registers)
3317 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3318 __ movptr(pos, 0);
3319 __ align(OptoLoopAlignment);
3320
3321 __ BIND(L_loopTop_256);
3322 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3323 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3324 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3325 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3326 __ aesenc(xmm_result, as_XMMRegister(rnum));
3327 }
3328 load_key(xmm_temp, key, 0xe0);
3329 __ aesenclast(xmm_result, xmm_temp);
3330 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3331 // no need to store r to memory until we exit
3332 __ addptr(pos, AESBlockSize);
3333 __ subptr(len_reg, AESBlockSize);
3334 __ jcc(Assembler::notEqual, L_loopTop_256);
3335 __ jmp(L_exit);
3336
3337 return start;
3338 }
3339
3340 // Safefetch stubs.
3341 void generate_safefetch(const char* name, int size, address* entry,
3342 address* fault_pc, address* continuation_pc) {
3343 // safefetch signatures:
3344 // int SafeFetch32(int* adr, int errValue);
3345 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3346 //
3347 // arguments:
3348 // c_rarg0 = adr
3349 // c_rarg1 = errValue
3350 //
3351 // result:
3352  //   rax = *adr or errValue
3353
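  // If the load at fault_pc faults, the VM's signal handler transfers
  // control to continuation_pc, so the stub returns errValue instead of
  // crashing; otherwise the loaded value is returned.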
3354 StubCodeMark mark(this, "StubRoutines", name);
3355
3356 // Entry point, pc or function descriptor.
3357 *entry = __ pc();
3358
3359 // Load *adr into c_rarg1, may fault.
3360 *fault_pc = __ pc();
3361 switch (size) {
3362 case 4:
3363 // int32_t
3364 __ movl(c_rarg1, Address(c_rarg0, 0));
3365 break;
3366 case 8:
3367 // int64_t
3368 __ movq(c_rarg1, Address(c_rarg0, 0));
3369 break;
3370 default:
3371 ShouldNotReachHere();
3372 }
3373
3374 // return errValue or *adr
3375 *continuation_pc = __ pc();
3376 __ movq(rax, c_rarg1);
3377 __ ret(0);
3378 }
3379
3380 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3381 // to hide instruction latency
3382 //
3383 // Arguments:
3384 //
3385 // Inputs:
3386 // c_rarg0 - source byte array address
3387 // c_rarg1 - destination byte array address
3388 // c_rarg2 - K (key) in little endian int array
3389 // c_rarg3 - r vector byte array address
3390 // c_rarg4 - input length
3391 //
3392 // Output:
3393 // rax - input length
3394 //
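  // Unlike encryption, CBC decryption parallelizes across blocks:
  //   P[i] = D_K(C[i]) ^ C[i-1]   (with C[-1] = IV)
  // Each plaintext depends only on ciphertext that is already available,
  // so the main loops below decrypt PARALLEL_FACTOR blocks per iteration
  // and xor each with the preceding ciphertext block afterwards.
  //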
3395 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3396 assert(UseAES, "need AES instructions and misaligned SSE support");
3397 __ align(CodeEntryAlignment);
3398 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3399 address start = __ pc();
3400
3401 const Register from = c_rarg0; // source array address
3402 const Register to = c_rarg1; // destination array address
3403 const Register key = c_rarg2; // key array address
3404 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3405                                     // and left with the last ciphertext block, which becomes the IV for a subsequent call
3406#ifndef _WIN64
3407 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3408#else
3409 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3410 const Register len_reg = r11; // pick the volatile windows register
3411#endif
3412 const Register pos = rax;
3413
3414 const int PARALLEL_FACTOR = 4;
3415 const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3416
3417 Label L_exit;
3418 Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3419 Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3420 Label L_singleBlock_loopTop[3]; // 128, 192, 256
3421 Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3422 Label L_multiBlock_loopTop[3]; // 128, 192, 256
3423
3424 // keys 0-10 preloaded into xmm5-xmm15
3425 const int XMM_REG_NUM_KEY_FIRST = 5;
3426 const int XMM_REG_NUM_KEY_LAST = 15;
3427 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3428 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3429
3430 __ enter(); // required for proper stackwalking of RuntimeStub frame
3431
3432#ifdef _WIN64
3433 // on win64, fill len_reg from stack position
3434 __ movl(len_reg, len_mem);
3435#else
3436 __ push(len_reg); // Save
3437#endif
3438 __ push(rbx);
3439 // the java expanded key ordering is rotated one position from what we want
3440 // so we start from 0x10 here and hit 0x00 last
3441 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3442 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3443 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3444 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3445 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3446 offset += 0x10;
3447 }
3448 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3449
3450 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block
3451
3452 // registers holding the four results in the parallelized loop
3453 const XMMRegister xmm_result0 = xmm0;
3454 const XMMRegister xmm_result1 = xmm2;
3455 const XMMRegister xmm_result2 = xmm3;
3456 const XMMRegister xmm_result3 = xmm4;
3457
3458 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
3459
3460 __ xorptr(pos, pos);
3461
3462 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3463 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3464 __ cmpl(rbx, 52);
3465 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3466 __ cmpl(rbx, 60);
3467 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3468
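// DoFour applies one instruction to each of the four parallel result
// registers.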
3469#define DoFour(opc, src_reg) \
3470 __ opc(xmm_result0, src_reg); \
3471 __ opc(xmm_result1, src_reg); \
3472 __ opc(xmm_result2, src_reg); \
3473    __ opc(xmm_result3, src_reg);
3474
3475 for (int k = 0; k < 3; ++k) {
3476 __ BIND(L_multiBlock_loopTopHead[k]);
3477 if (k != 0) {
3478 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3479 __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3480 }
3481 if (k == 1) {
3482 __ subptr(rsp, 6 * wordSize);
3483 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3484 load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3485 __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3486 load_key(xmm1, key, 0xc0); // 0xc0;
3487 __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3488 } else if (k == 2) {
3489 __ subptr(rsp, 10 * wordSize);
3490 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3491        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3492 __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3493 load_key(xmm1, key, 0xe0); // 0xe0;
3494 __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3495 load_key(xmm15, key, 0xb0); // 0xb0;
3496 __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3497 load_key(xmm1, key, 0xc0); // 0xc0;
3498 __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3499 }
3500 __ align(OptoLoopAlignment);
3501 __ BIND(L_multiBlock_loopTop[k]);
3502 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3503 __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3504
3505 if (k != 0) {
3506 __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3507 __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3508 }
3509
3510 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3511 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3512 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3513 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3514
3515 DoFour(pxor, xmm_key_first);
3516 if (k == 0) {
3517 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3518 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3519 }
3520 DoFour(aesdeclast, xmm_key_last);
3521 } else if (k == 1) {
3522 for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3523 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3524 }
3525 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3526 DoFour(aesdec, xmm1); // key : 0xc0
3527 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
3528 DoFour(aesdeclast, xmm_key_last);
3529 } else if (k == 2) {
3530 for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3531 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3532 }
3533 DoFour(aesdec, xmm1); // key : 0xc0
3534 __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3535 __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3536 DoFour(aesdec, xmm15); // key : 0xd0
3537 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3538 DoFour(aesdec, xmm1); // key : 0xe0
3539 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
3540 DoFour(aesdeclast, xmm_key_last);
3541 }
3542
3543 // for each result, xor with the r vector of previous cipher block
3544 __ pxor(xmm_result0, xmm_prev_block_cipher);
3545 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3546 __ pxor(xmm_result1, xmm_prev_block_cipher);
3547 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3548 __ pxor(xmm_result2, xmm_prev_block_cipher);
3549 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3550 __ pxor(xmm_result3, xmm_prev_block_cipher);
3551 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
3552 if (k != 0) {
3553 __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3554 }
3555
3556 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
3557 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3558 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3559 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3560
3561 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3562 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3563 __ jmp(L_multiBlock_loopTop[k]);
3564
3565 // registers used in the non-parallelized loops
3566 // xmm register assignments for the loops below
3567 const XMMRegister xmm_result = xmm0;
3568 const XMMRegister xmm_prev_block_cipher_save = xmm2;
3569 const XMMRegister xmm_key11 = xmm3;
3570 const XMMRegister xmm_key12 = xmm4;
3571 const XMMRegister key_tmp = xmm4;
3572
3573 __ BIND(L_singleBlock_loopTopHead[k]);
3574 if (k == 1) {
3575 __ addptr(rsp, 6 * wordSize);
3576 } else if (k == 2) {
3577 __ addptr(rsp, 10 * wordSize);
3578 }
3579 __ cmpptr(len_reg, 0); // any blocks left??
3580 __ jcc(Assembler::equal, L_exit);
3581 __ BIND(L_singleBlock_loopTopHead2[k]);
3582 if (k == 1) {
3583        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3584        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3585 }
3586 if (k == 2) {
3587        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3588 }
3589 __ align(OptoLoopAlignment);
3590 __ BIND(L_singleBlock_loopTop[k]);
3591 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3592 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3593 __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3594 for (int rnum = 1; rnum <= 9 ; rnum++) {
3595 __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3596 }
3597 if (k == 1) {
3598 __ aesdec(xmm_result, xmm_key11);
3599 __ aesdec(xmm_result, xmm_key12);
3600 }
3601 if (k == 2) {
3602 __ aesdec(xmm_result, xmm_key11);
3603 load_key(key_tmp, key, 0xc0);
3604 __ aesdec(xmm_result, key_tmp);
3605 load_key(key_tmp, key, 0xd0);
3606 __ aesdec(xmm_result, key_tmp);
3607 load_key(key_tmp, key, 0xe0);
3608 __ aesdec(xmm_result, key_tmp);
3609 }
3610
3611 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3612 __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3613 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3614 // no need to store r to memory until we exit
3615 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3616 __ addptr(pos, AESBlockSize);
3617 __ subptr(len_reg, AESBlockSize);
3618 __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3619 if (k != 2) {
3620 __ jmp(L_exit);
3621 }
3622 } //for 128/192/256
3623
3624 __ BIND(L_exit);
3625 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
3626 __ pop(rbx);
3627#ifdef _WIN64
3628 __ movl(rax, len_mem);
3629#else
3630 __ pop(rax); // return length
3631#endif
3632 __ leave(); // required for proper stackwalking of RuntimeStub frame
3633 __ ret(0);
3634 return start;
3635  }
3636
3637 address generate_upper_word_mask() {
3638 __ align(64);
3639 StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3640 address start = __ pc();
3641 __ emit_data64(0x0000000000000000, relocInfo::none);
3642 __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3643 return start;
3644 }
3645
3646 address generate_shuffle_byte_flip_mask() {
3647 __ align(64);
3648 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3649 address start = __ pc();
3650 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3651 __ emit_data64(0x0001020304050607, relocInfo::none);
3652 return start;
3653 }
3654
3655  // ofs and limit are used for multi-block byte arrays.
3656 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
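  // The stub only sets up the argument registers and a small stack scratch
  // area; MacroAssembler::fast_sha1 performs the actual 64-byte block
  // compression and handles the multi-block loop over ofs/limit when
  // multi_block is true.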
3657 address generate_sha1_implCompress(bool multi_block, const char *name) {
3658 __ align(CodeEntryAlignment);
3659 StubCodeMark mark(this, "StubRoutines", name);
3660 address start = __ pc();
3661
3662 Register buf = c_rarg0;
3663 Register state = c_rarg1;
3664 Register ofs = c_rarg2;
3665 Register limit = c_rarg3;
3666
3667 const XMMRegister abcd = xmm0;
3668 const XMMRegister e0 = xmm1;
3669 const XMMRegister e1 = xmm2;
3670 const XMMRegister msg0 = xmm3;
3671
3672 const XMMRegister msg1 = xmm4;
3673 const XMMRegister msg2 = xmm5;
3674 const XMMRegister msg3 = xmm6;
3675 const XMMRegister shuf_mask = xmm7;
3676
3677 __ enter();
3678
3679 __ subptr(rsp, 4 * wordSize);
3680
3681 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3682 buf, state, ofs, limit, rsp, multi_block);
3683
3684 __ addptr(rsp, 4 * wordSize);
3685
3686 __ leave();
3687 __ ret(0);
3688 return start;
3689 }
3690
3691 address generate_pshuffle_byte_flip_mask() {
3692 __ align(64);
3693 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3694 address start = __ pc();
3695 __ emit_data64(0x0405060700010203, relocInfo::none);
3696 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3697
3698 if (VM_Version::supports_avx2()) {
3699 __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3700 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3701 // _SHUF_00BA
3702 __ emit_data64(0x0b0a090803020100, relocInfo::none);
3703 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3704 __ emit_data64(0x0b0a090803020100, relocInfo::none);
3705 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3706 // _SHUF_DC00
3707 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3708 __ emit_data64(0x0b0a090803020100, relocInfo::none);
3709 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3710 __ emit_data64(0x0b0a090803020100, relocInfo::none);
3711 }
3712
3713 return start;
3714 }
3715
3716 //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3717 address generate_pshuffle_byte_flip_mask_sha512() {
3718 __ align(32);
3719 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3720 address start = __ pc();
3721 if (VM_Version::supports_avx2()) {
3722 __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3723 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3724 __ emit_data64(0x1011121314151617, relocInfo::none);
3725 __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3726 __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3727 __ emit_data64(0x0000000000000000, relocInfo::none);
3728 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3729 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3730 }
3731
3732 return start;
3733 }
3734
3735// ofs and limit are used for multi-block byte arrays.
3736// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
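// The stub dispatches to the SHA-extension implementation when available
// and falls back to the AVX2 implementation otherwise (see the
// supports_sha() / supports_avx2() checks below).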
3737 address generate_sha256_implCompress(bool multi_block, const char *name) {
3738 assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3739 __ align(CodeEntryAlignment);
3740 StubCodeMark mark(this, "StubRoutines", name);
3741 address start = __ pc();
3742
3743 Register buf = c_rarg0;
3744 Register state = c_rarg1;
3745 Register ofs = c_rarg2;
3746 Register limit = c_rarg3;
3747
3748 const XMMRegister msg = xmm0;
3749 const XMMRegister state0 = xmm1;
3750 const XMMRegister state1 = xmm2;
3751 const XMMRegister msgtmp0 = xmm3;
3752
3753 const XMMRegister msgtmp1 = xmm4;
3754 const XMMRegister msgtmp2 = xmm5;
3755 const XMMRegister msgtmp3 = xmm6;
3756 const XMMRegister msgtmp4 = xmm7;
3757
3758 const XMMRegister shuf_mask = xmm8;
3759
3760 __ enter();
3761
3762 __ subptr(rsp, 4 * wordSize);
3763
3764 if (VM_Version::supports_sha()) {
3765 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3766 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3767 } else if (VM_Version::supports_avx2()) {
3768 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3769 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3770 }
3771 __ addptr(rsp, 4 * wordSize);
3772 __ vzeroupper();
3773 __ leave();
3774 __ ret(0);
3775 return start;
3776 }
3777
3778 address generate_sha512_implCompress(bool multi_block, const char *name) {
3779 assert(VM_Version::supports_avx2(), "");
3780 assert(VM_Version::supports_bmi2(), "");
3781 __ align(CodeEntryAlignment);
3782 StubCodeMark mark(this, "StubRoutines", name);
3783 address start = __ pc();
3784
3785 Register buf = c_rarg0;
3786 Register state = c_rarg1;
3787 Register ofs = c_rarg2;
3788 Register limit = c_rarg3;
3789
3790 const XMMRegister msg = xmm0;
3791 const XMMRegister state0 = xmm1;
3792 const XMMRegister state1 = xmm2;
3793 const XMMRegister msgtmp0 = xmm3;
3794 const XMMRegister msgtmp1 = xmm4;
3795 const XMMRegister msgtmp2 = xmm5;
3796 const XMMRegister msgtmp3 = xmm6;
3797 const XMMRegister msgtmp4 = xmm7;
3798
3799 const XMMRegister shuf_mask = xmm8;
3800
3801 __ enter();
3802
3803 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3804 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3805
3806 __ vzeroupper();
3807 __ leave();
3808 __ ret(0);
3809 return start;
3810 }
3811
3812 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3813 // to hide instruction latency
3814 //
3815 // Arguments:
3816 //
3817 // Inputs:
3818 // c_rarg0 - source byte array address
3819 // c_rarg1 - destination byte array address
3820 // c_rarg2 - K (key) in little endian int array
3821 // c_rarg3 - counter vector byte array address
3822 // Linux
3823 // c_rarg4 - input length
3824 // c_rarg5 - saved encryptedCounter start
3825 // rbp + 6 * wordSize - saved used length
3826 // Windows
3827 // rbp + 6 * wordSize - input length
3828 // rbp + 7 * wordSize - saved encryptedCounter start
3829 // rbp + 8 * wordSize - saved used length
3830 //
3831 // Output:
3832 // rax - input length
3833 //
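  // CTR mode turns AES into a stream cipher:
  //   output[i] = input[i] ^ E_K(counter + i)
  // The pre-loop below first consumes any keystream bytes left over from
  // the previous invocation (tracked via 'used' and the saved encrypted
  // counter), then the main loops encrypt six counters at a time and xor
  // them with the input; a tail path handles partial blocks.
  //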
3834 address generate_counterMode_AESCrypt_Parallel() {
3835 assert(UseAES, "need AES instructions and misaligned SSE support");
3836 __ align(CodeEntryAlignment);
3837 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3838 address start = __ pc();
3839 const Register from = c_rarg0; // source array address
3840 const Register to = c_rarg1; // destination array address
3841 const Register key = c_rarg2; // key array address
3842 const Register counter = c_rarg3; // counter byte array initialized from counter array address
3843 // and updated with the incremented counter in the end
3844#ifndef _WIN64
3845 const Register len_reg = c_rarg4;
3846 const Register saved_encCounter_start = c_rarg5;
3847 const Register used_addr = r10;
3848 const Address used_mem(rbp, 2 * wordSize);
3849 const Register used = r11;
3850#else
3851 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3852    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
3853    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
3854 const Register len_reg = r10; // pick the first volatile windows register
3855 const Register saved_encCounter_start = r11;
3856 const Register used_addr = r13;
3857 const Register used = r14;
3858#endif
3859 const Register pos = rax;
3860
3861 const int PARALLEL_FACTOR = 6;
3862 const XMMRegister xmm_counter_shuf_mask = xmm0;
3863 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3864 const XMMRegister xmm_curr_counter = xmm2;
3865
3866 const XMMRegister xmm_key_tmp0 = xmm3;
3867 const XMMRegister xmm_key_tmp1 = xmm4;
3868
3869 // registers holding the four results in the parallelized loop
3870 const XMMRegister xmm_result0 = xmm5;
3871 const XMMRegister xmm_result1 = xmm6;
3872 const XMMRegister xmm_result2 = xmm7;
3873 const XMMRegister xmm_result3 = xmm8;
3874 const XMMRegister xmm_result4 = xmm9;
3875 const XMMRegister xmm_result5 = xmm10;
3876
3877 const XMMRegister xmm_from0 = xmm11;
3878 const XMMRegister xmm_from1 = xmm12;
3879 const XMMRegister xmm_from2 = xmm13;
3880    const XMMRegister xmm_from3 = xmm14;   // the last one is xmm14; we have to preserve it on WIN64.
3881    const XMMRegister xmm_from4 = xmm3;    // reuse xmm3~4; xmm_key_tmp0~1 are no longer needed by the time the input text is loaded
3882 const XMMRegister xmm_from5 = xmm4;
3883
3884 //for key_128, key_192, key_256
3885 const int rounds[3] = {10, 12, 14};
3886 Label L_exit_preLoop, L_preLoop_start;
3887 Label L_multiBlock_loopTop[3];
3888 Label L_singleBlockLoopTop[3];
3889 Label L__incCounter[3][6]; //for 6 blocks
3890 Label L__incCounter_single[3]; //for single block, key128, key192, key256
3891 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3892 Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3893
3894 Label L_exit;
3895
3896 __ enter(); // required for proper stackwalking of RuntimeStub frame
3897
3898#ifdef _WIN64
3899 // allocate spill slots for r13, r14
3900 enum {
3901 saved_r13_offset,
3902 saved_r14_offset
3903 };
3904 __ subptr(rsp, 2 * wordSize);
3905 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3906 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
3907
3908 // on win64, fill len_reg from stack position
3909 __ movl(len_reg, len_mem);
3910 __ movptr(saved_encCounter_start, saved_encCounter_mem);
3911 __ movptr(used_addr, used_mem);
3912 __ movl(used, Address(used_addr, 0));
3913#else
3914 __ push(len_reg); // Save
3915 __ movptr(used_addr, used_mem);
3916 __ movl(used, Address(used_addr, 0));
3917#endif
3918
3919 __ push(rbx); // Save RBX
3920 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3921 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
3922 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
3923 __ movptr(pos, 0);
3924
3925    // Use the partially used encrypted counter from last invocation
3926 __ BIND(L_preLoop_start);
3927 __ cmpptr(used, 16);
3928 __ jcc(Assembler::aboveEqual, L_exit_preLoop);
3929 __ cmpptr(len_reg, 0);
3930 __ jcc(Assembler::lessEqual, L_exit_preLoop);
3931 __ movb(rbx, Address(saved_encCounter_start, used));
3932 __ xorb(rbx, Address(from, pos));
3933 __ movb(Address(to, pos), rbx);
3934 __ addptr(pos, 1);
3935 __ addptr(used, 1);
3936 __ subptr(len_reg, 1);
3937
3938 __ jmp(L_preLoop_start);
3939
3940 __ BIND(L_exit_preLoop);
3941 __ movl(Address(used_addr, 0), used);
3942
3943 // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
3944 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
3945 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3946 __ cmpl(rbx, 52);
3947 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
3948 __ cmpl(rbx, 60);
3949 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
3950
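// CTR_DoSix applies one instruction to each of the six parallel result
// registers.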
3951#define CTR_DoSix(opc, src_reg) \
3952 __ opc(xmm_result0, src_reg); \
3953 __ opc(xmm_result1, src_reg); \
3954 __ opc(xmm_result2, src_reg); \
3955 __ opc(xmm_result3, src_reg); \
3956 __ opc(xmm_result4, src_reg); \
3957 __ opc(xmm_result5, src_reg);
3958
3959 // k == 0 : generate code for key_128
3960 // k == 1 : generate code for key_192
3961 // k == 2 : generate code for key_256
3962 for (int k = 0; k < 3; ++k) {
3963      // multi-block loop starts here
3964 __ align(OptoLoopAlignment);
3965 __ BIND(L_multiBlock_loopTop[k]);
3966 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
3967 __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
3968 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
3969
3970 //load, then increase counters
3971 CTR_DoSix(movdqa, xmm_curr_counter);
3972 inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
3973 inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
3974 inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
3975 inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
3976 inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
3977 inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
3978      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR
3979 CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
3980
3981 //load two ROUND_KEYs at a time
3982 for (int i = 1; i < rounds[k]; ) {
3983 load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
3984 load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
3985 CTR_DoSix(aesenc, xmm_key_tmp1);
3986 i++;
3987 if (i != rounds[k]) {
3988 CTR_DoSix(aesenc, xmm_key_tmp0);
3989 } else {
3990 CTR_DoSix(aesenclast, xmm_key_tmp0);
3991 }
3992 i++;
3993 }
3994
3995 // get next PARALLEL_FACTOR blocks into xmm_result registers
3996 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3997 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3998 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3999 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4000 __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4001 __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4002
4003 __ pxor(xmm_result0, xmm_from0);
4004 __ pxor(xmm_result1, xmm_from1);
4005 __ pxor(xmm_result2, xmm_from2);
4006 __ pxor(xmm_result3, xmm_from3);
4007 __ pxor(xmm_result4, xmm_from4);
4008 __ pxor(xmm_result5, xmm_from5);
4009
4010      // store 6 results into the next 96 bytes of output
4011 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4012 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4013 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4014 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4015 __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4016 __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4017
4018      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position by 6 blocks
4019 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4020 __ jmp(L_multiBlock_loopTop[k]);
4021
4022 // singleBlock starts here
4023 __ align(OptoLoopAlignment);
4024 __ BIND(L_singleBlockLoopTop[k]);
4025 __ cmpptr(len_reg, 0);
4026 __ jcc(Assembler::lessEqual, L_exit);
4027 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4028 __ movdqa(xmm_result0, xmm_curr_counter);
4029 inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4030 __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4031 __ pxor(xmm_result0, xmm_key_tmp0);
4032 for (int i = 1; i < rounds[k]; i++) {
4033 load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4034 __ aesenc(xmm_result0, xmm_key_tmp0);
4035 }
4036 load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4037 __ aesenclast(xmm_result0, xmm_key_tmp0);
4038 __ cmpptr(len_reg, AESBlockSize);
4039 __ jcc(Assembler::less, L_processTail_insr[k]);
4040 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4041 __ pxor(xmm_result0, xmm_from0);
4042 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4043 __ addptr(pos, AESBlockSize);
4044 __ subptr(len_reg, AESBlockSize);
4045 __ jmp(L_singleBlockLoopTop[k]);
4046 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array
4047 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
4048 __ testptr(len_reg, 8);
4049 __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4050 __ subptr(pos,8);
4051 __ pinsrq(xmm_from0, Address(from, pos), 0);
4052 __ BIND(L_processTail_4_insr[k]);
4053 __ testptr(len_reg, 4);
4054 __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4055 __ subptr(pos,4);
4056 __ pslldq(xmm_from0, 4);
4057 __ pinsrd(xmm_from0, Address(from, pos), 0);
4058 __ BIND(L_processTail_2_insr[k]);
4059 __ testptr(len_reg, 2);
4060 __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4061 __ subptr(pos, 2);
4062 __ pslldq(xmm_from0, 2);
4063 __ pinsrw(xmm_from0, Address(from, pos), 0);
4064 __ BIND(L_processTail_1_insr[k]);
4065 __ testptr(len_reg, 1);
4066 __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4067 __ subptr(pos, 1);
4068 __ pslldq(xmm_from0, 1);
4069 __ pinsrb(xmm_from0, Address(from, pos), 0);
4070 __ BIND(L_processTail_exit_insr[k]);
4071
4072 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4073 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
4074
4075 __ testptr(len_reg, 8);
4076 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
4077 __ pextrq(Address(to, pos), xmm_result0, 0);
4078 __ psrldq(xmm_result0, 8);
4079 __ addptr(pos, 8);
4080 __ BIND(L_processTail_4_extr[k]);
4081 __ testptr(len_reg, 4);
4082 __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4083 __ pextrd(Address(to, pos), xmm_result0, 0);
4084 __ psrldq(xmm_result0, 4);
4085 __ addptr(pos, 4);
4086 __ BIND(L_processTail_2_extr[k]);
4087 __ testptr(len_reg, 2);
4088 __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4089 __ pextrw(Address(to, pos), xmm_result0, 0);
4090 __ psrldq(xmm_result0, 2);
4091 __ addptr(pos, 2);
4092 __ BIND(L_processTail_1_extr[k]);
4093 __ testptr(len_reg, 1);
4094 __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4095 __ pextrb(Address(to, pos), xmm_result0, 0);
4096
4097 __ BIND(L_processTail_exit_extr[k]);
4098 __ movl(Address(used_addr, 0), len_reg);
4099 __ jmp(L_exit);
4100
4101 }
4102
4103 __ BIND(L_exit);
4104 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4105 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4106 __ pop(rbx); // pop the saved RBX.
4107#ifdef _WIN64
4108 __ movl(rax, len_mem);
4109 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4110 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4111 __ addptr(rsp, 2 * wordSize);
4112#else
4113 __ pop(rax); // return 'len'
4114#endif
4115 __ leave(); // required for proper stackwalking of RuntimeStub frame
4116 __ ret(0);
4117 return start;
4118 }
4119
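// roundDec/roundDeclast apply one AES decryption round with the given
// (broadcast) round key to all eight 512-bit register groups xmm1-xmm8,
// i.e. 32 blocks per call.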
4120void roundDec(XMMRegister xmm_reg) {
4121 __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4122 __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4123 __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4124 __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4125 __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4126 __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4127 __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4128 __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4129}
4130
4131void roundDeclast(XMMRegister xmm_reg) {
4132 __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4133 __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4134 __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4135 __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4136 __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4137 __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4138 __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4139 __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4140}
4141
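  // Load one 128-bit round key, byte-swap it with the key shuffle mask,
  // and broadcast it to all four 128-bit lanes of a 512-bit register so
  // that a single vaesdec round covers four blocks.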
4142 void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4143 __ movdqu(xmmdst, Address(key, offset));
4144 if (xmm_shuf_mask != NULL) {
4145 __ pshufb(xmmdst, xmm_shuf_mask);
4146 } else {
4147 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4148 }
4149 __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4150
4151 }
4152
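// Vectorized (AVX-512 + VAES) version of CBC/AES decryption.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//   c_rarg3   - r vector byte array address
//   c_rarg4   - input length
//
// Output:
//   rax       - input length
//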
4153address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4154  assert(VM_Version::supports_vaes(), "need vector AES (VAES) instruction support");
4155 __ align(CodeEntryAlignment);
4156 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4157 address start = __ pc();
4158
4159 const Register from = c_rarg0; // source array address
4160 const Register to = c_rarg1; // destination array address
4161 const Register key = c_rarg2; // key array address
4162 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
4163                                     // and left with the last ciphertext block, which becomes the IV for a subsequent call
4164#ifndef _WIN64
4165 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
4166#else
4167 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4168 const Register len_reg = r11; // pick the volatile windows register
4169#endif
4170
4171 Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4172 Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4173
4174 __ enter();
4175
4176#ifdef _WIN64
4177 // on win64, fill len_reg from stack position
4178 __ movl(len_reg, len_mem);
4179#else
4180 __ push(len_reg); // Save
4181#endif
4182 __ push(rbx);
4183 __ vzeroupper();
4184
4185 // Temporary variable declaration for swapping key bytes
4186 const XMMRegister xmm_key_shuf_mask = xmm1;
4187 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4188
4189  // Load the expanded key length in ints: 44 means 10 rounds, 52 means 12 rounds, 60 means 14 rounds
4190 const Register rounds = rbx;
4191 __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4192
4193 const XMMRegister IV = xmm0;
4194 // Load IV and broadcast value to 512-bits
4195 __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4196
4197 // Temporary variables for storing round keys
4198 const XMMRegister RK0 = xmm30;
4199 const XMMRegister RK1 = xmm9;
4200 const XMMRegister RK2 = xmm18;
4201 const XMMRegister RK3 = xmm19;
4202 const XMMRegister RK4 = xmm20;
4203 const XMMRegister RK5 = xmm21;
4204 const XMMRegister RK6 = xmm22;
4205 const XMMRegister RK7 = xmm23;
4206 const XMMRegister RK8 = xmm24;
4207 const XMMRegister RK9 = xmm25;
4208 const XMMRegister RK10 = xmm26;
4209
4210 // Load and shuffle key
4211 // the java expanded key ordering is rotated one position from what we want
4212 // so we start from 1*16 here and hit 0*16 last
4213 ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4214 ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4215 ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4216 ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4217 ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4218 ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4219 ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4220 ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4221 ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4222 ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4223 ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4224
4225 // Variables for storing source cipher text
4226 const XMMRegister S0 = xmm10;
4227 const XMMRegister S1 = xmm11;
4228 const XMMRegister S2 = xmm12;
4229 const XMMRegister S3 = xmm13;
4230 const XMMRegister S4 = xmm14;
4231 const XMMRegister S5 = xmm15;
4232 const XMMRegister S6 = xmm16;
4233 const XMMRegister S7 = xmm17;
4234
4235 // Variables for storing decrypted text
4236 const XMMRegister B0 = xmm1;
4237 const XMMRegister B1 = xmm2;
4238 const XMMRegister B2 = xmm3;
4239 const XMMRegister B3 = xmm4;
4240 const XMMRegister B4 = xmm5;
4241 const XMMRegister B5 = xmm6;
4242 const XMMRegister B6 = xmm7;
4243 const XMMRegister B7 = xmm8;
4244
4245 __ cmpl(rounds, 44);
4246 __ jcc(Assembler::greater, KEY_192);
4247 __ jmp(Loop);
4248
4249 __ BIND(KEY_192);
4250 const XMMRegister RK11 = xmm27;
4251 const XMMRegister RK12 = xmm28;
4252 ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4253 ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4254
4255 __ cmpl(rounds, 52);
4256 __ jcc(Assembler::greater, KEY_256);
4257 __ jmp(Loop);
4258
4259 __ BIND(KEY_256);
4260 const XMMRegister RK13 = xmm29;
4261 const XMMRegister RK14 = xmm31;
4262 ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4263 ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4264
4265 __ BIND(Loop);
4266 __ cmpl(len_reg, 512);
4267 __ jcc(Assembler::below, Lcbc_dec_rem);
4268 __ BIND(Loop1);
4269 __ subl(len_reg, 512);
4270 __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4271 __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4272 __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4273 __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4274 __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4275 __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4276 __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4277 __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4278 __ leaq(from, Address(from, 8 * 64));
4279
4280 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4281 __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4282 __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4283 __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4284 __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4285 __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4286 __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4287 __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4288
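  // Rotate the ciphertext vectors so that, lane for lane, IV and S0-S6
  // hold the ciphertext block immediately preceding the block decrypted
  // into B0-B7; these become the CBC xor inputs applied at Loop2 below.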
4289 __ evalignq(IV, S0, IV, 0x06);
4290 __ evalignq(S0, S1, S0, 0x06);
4291 __ evalignq(S1, S2, S1, 0x06);
4292 __ evalignq(S2, S3, S2, 0x06);
4293 __ evalignq(S3, S4, S3, 0x06);
4294 __ evalignq(S4, S5, S4, 0x06);
4295 __ evalignq(S5, S6, S5, 0x06);
4296 __ evalignq(S6, S7, S6, 0x06);
4297
4298 roundDec(RK2);
4299 roundDec(RK3);
4300 roundDec(RK4);
4301 roundDec(RK5);
4302 roundDec(RK6);
4303 roundDec(RK7);
4304 roundDec(RK8);
4305 roundDec(RK9);
4306 roundDec(RK10);
4307
4308 __ cmpl(rounds, 44);
4309 __ jcc(Assembler::belowEqual, L_128);
4310 roundDec(RK11);
4311 roundDec(RK12);
4312
4313 __ cmpl(rounds, 52);
4314 __ jcc(Assembler::belowEqual, L_192);
4315 roundDec(RK13);
4316 roundDec(RK14);
4317
4318 __ BIND(L_256);
4319 roundDeclast(RK0);
4320 __ jmp(Loop2);
4321
4322 __ BIND(L_128);
4323 roundDeclast(RK0);
4324 __ jmp(Loop2);
4325
4326 __ BIND(L_192);
4327 roundDeclast(RK0);
4328
4329 __ BIND(Loop2);
4330 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4331 __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4332 __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4333 __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4334 __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4335 __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4336 __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4337 __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4338 __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4339
4340 __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4341 __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4342 __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4343 __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4344 __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4345 __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4346 __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4347 __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4348 __ leaq(to, Address(to, 8 * 64));
4349 __ jmp(Loop);
4350
4351 __ BIND(Lcbc_dec_rem);
4352 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
4353
4354 __ BIND(Lcbc_dec_rem_loop);
4355 __ subl(len_reg, 16);
4356 __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4357
4358 __ movdqu(S0, Address(from, 0));
4359 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4360 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4361 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4362 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4363 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4364 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4365 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4366 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4367 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4368 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4369 __ cmpl(rounds, 44);
4370 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4371
4372 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4373 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4374 __ cmpl(rounds, 52);
4375 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4376
4377 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4378 __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4379
4380 __ BIND(Lcbc_dec_rem_last);
4381 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4382
4383 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4384 __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4385 __ movdqu(Address(to, 0), B0);
4386 __ leaq(from, Address(from, 16));
4387 __ leaq(to, Address(to, 16));
4388 __ jmp(Lcbc_dec_rem_loop);
4389
4390 __ BIND(Lcbc_dec_ret);
4391 __ movdqu(Address(rvec, 0), IV);
4392
4393 // Zero out the round keys
4394 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4395 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4396 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4397 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4398 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4399 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4400 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4401 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4402 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4403 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4404 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4405 __ cmpl(rounds, 44);
4406 __ jcc(Assembler::belowEqual, Lcbc_exit);
4407 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4408 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4409 __ cmpl(rounds, 52);
4410 __ jcc(Assembler::belowEqual, Lcbc_exit);
4411 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4412 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4413
4414 __ BIND(Lcbc_exit);
4415 __ pop(rbx);
4416#ifdef _WIN64
4417 __ movl(rax, len_mem);
4418#else
4419 __ pop(rax); // return length
4420#endif
4421 __ leave(); // required for proper stackwalking of RuntimeStub frame
4422 __ ret(0);
4423 return start;
4424}
4425
4426// Polynomial x^128+x^127+x^126+x^121+1
4427address ghash_polynomial_addr() {
4428 __ align(CodeEntryAlignment);
4429 StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
4430 address start = __ pc();
4431 __ emit_data64(0x0000000000000001, relocInfo::none);
4432 __ emit_data64(0xc200000000000000, relocInfo::none);
4433 return start;
4434}
4435
4436address ghash_shufflemask_addr() {
4437 __ align(CodeEntryAlignment);
4438 StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
4439 address start = __ pc();
4440 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4441 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4442 return start;
4443}
4444
4445// Ghash single and multi block operations using AVX instructions
4446address generate_avx_ghash_processBlocks() {
4447 __ align(CodeEntryAlignment);
4448
4449 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4450 address start = __ pc();
4451
4452 // arguments
4453 const Register state = c_rarg0;
4454 const Register htbl = c_rarg1;
4455 const Register data = c_rarg2;
4456 const Register blocks = c_rarg3;
4457 __ enter();
4458 // Save state before entering routine
4459 __ avx_ghash(state, htbl, data, blocks);
4460 __ leave(); // required for proper stackwalking of RuntimeStub frame
4461 __ ret(0);
4462 return start;
4463}
4464
4465 // byte swap x86 long
4466 address generate_ghash_long_swap_mask() {
4467 __ align(CodeEntryAlignment);
4468 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4469 address start = __ pc();
4470 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4471 __ emit_data64(0x0706050403020100, relocInfo::none );
4472 return start;
4473 }
4474
4475 // byte swap x86 byte array
4476 address generate_ghash_byte_swap_mask() {
4477 __ align(CodeEntryAlignment);
4478 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4479 address start = __ pc();
4480 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4481 __ emit_data64(0x0001020304050607, relocInfo::none );
4482 return start;
4483 }
4484
4485 /* Single and multi-block ghash operations */
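  // GHASH folds each 16-byte block of data into the state as
  //   state = (state ^ block) * H   in GF(2^128)
  // The multiply below is built from four pclmulqdq carry-less multiplies,
  // a one-bit left shift to account for the reflected bit order, and a
  // two-phase reduction modulo the GHASH polynomial.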
4486 address generate_ghash_processBlocks() {
4487 __ align(CodeEntryAlignment);
4488 Label L_ghash_loop, L_exit;
4489 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4490 address start = __ pc();
4491
4492 const Register state = c_rarg0;
4493 const Register subkeyH = c_rarg1;
4494 const Register data = c_rarg2;
4495 const Register blocks = c_rarg3;
4496
4497 const XMMRegister xmm_temp0 = xmm0;
4498 const XMMRegister xmm_temp1 = xmm1;
4499 const XMMRegister xmm_temp2 = xmm2;
4500 const XMMRegister xmm_temp3 = xmm3;
4501 const XMMRegister xmm_temp4 = xmm4;
4502 const XMMRegister xmm_temp5 = xmm5;
4503 const XMMRegister xmm_temp6 = xmm6;
4504 const XMMRegister xmm_temp7 = xmm7;
4505 const XMMRegister xmm_temp8 = xmm8;
4506 const XMMRegister xmm_temp9 = xmm9;
4507 const XMMRegister xmm_temp10 = xmm10;
4508
4509 __ enter();
4510
4511 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4512
4513 __ movdqu(xmm_temp0, Address(state, 0));
4514 __ pshufb(xmm_temp0, xmm_temp10);
4515
4516
4517 __ BIND(L_ghash_loop);
4518 __ movdqu(xmm_temp2, Address(data, 0));
4519 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4520
4521 __ movdqu(xmm_temp1, Address(subkeyH, 0));
4522 __ pshufb(xmm_temp1, xmm_temp10);
4523
4524 __ pxor(xmm_temp0, xmm_temp2);
4525
4526 //
4527 // Multiply with the hash key
4528 //
4529 __ movdqu(xmm_temp3, xmm_temp0);
4530 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
4531 __ movdqu(xmm_temp4, xmm_temp0);
4532 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
4533
4534 __ movdqu(xmm_temp5, xmm_temp0);
4535 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
4536 __ movdqu(xmm_temp6, xmm_temp0);
4537 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
4538
4539 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
4540
4541 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
4542    __ psrldq(xmm_temp4, 8);    // shift xmm4 64 bits to the right
4543    __ pslldq(xmm_temp5, 8);    // shift xmm5 64 bits to the left
4544 __ pxor(xmm_temp3, xmm_temp5);
4545 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
4546 // of the carry-less multiplication of
4547 // xmm0 by xmm1.
4548
4549 // We shift the result of the multiplication by one bit position
4550    // to the left to cope with the fact that the bits are reversed.
4551 __ movdqu(xmm_temp7, xmm_temp3);
4552 __ movdqu(xmm_temp8, xmm_temp6);
4553 __ pslld(xmm_temp3, 1);
4554 __ pslld(xmm_temp6, 1);
4555 __ psrld(xmm_temp7, 31);
4556 __ psrld(xmm_temp8, 31);
4557 __ movdqu(xmm_temp9, xmm_temp7);
4558 __ pslldq(xmm_temp8, 4);
4559 __ pslldq(xmm_temp7, 4);
4560 __ psrldq(xmm_temp9, 12);
4561 __ por(xmm_temp3, xmm_temp7);
4562 __ por(xmm_temp6, xmm_temp8);
4563 __ por(xmm_temp6, xmm_temp9);
4564
4565 //
4566 // First phase of the reduction
4567 //
4568 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4569 // independently.
4570 __ movdqu(xmm_temp7, xmm_temp3);
4571 __ movdqu(xmm_temp8, xmm_temp3);
4572 __ movdqu(xmm_temp9, xmm_temp3);
4573    __ pslld(xmm_temp7, 31);    // packed left shift << 31
4574    __ pslld(xmm_temp8, 30);    // packed left shift << 30
4575    __ pslld(xmm_temp9, 25);    // packed left shift << 25
4576 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
4577 __ pxor(xmm_temp7, xmm_temp9);
4578 __ movdqu(xmm_temp8, xmm_temp7);
4579 __ pslldq(xmm_temp7, 12);
4580 __ psrldq(xmm_temp8, 4);
4581 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
4582
4583 //
4584 // Second phase of the reduction
4585 //
4586 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4587 // shift operations.
4588 __ movdqu(xmm_temp2, xmm_temp3);
4589 __ movdqu(xmm_temp4, xmm_temp3);
4590 __ movdqu(xmm_temp5, xmm_temp3);
4591    __ psrld(xmm_temp2, 1);     // packed right shift >> 1
4592    __ psrld(xmm_temp4, 2);     // packed right shift >> 2
4593    __ psrld(xmm_temp5, 7);     // packed right shift >> 7
4594 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
4595 __ pxor(xmm_temp2, xmm_temp5);
4596 __ pxor(xmm_temp2, xmm_temp8);
4597 __ pxor(xmm_temp3, xmm_temp2);
4598 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
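    // For orientation only: the two phases above fold the (bit-shifted) 256-bit
    // carry-less product back into GF(2^128) modulo the GHASH polynomial
    // g(x) = x^128 + x^7 + x^2 + x + 1.  In the non-reflected convention the same
    // reduction can be sketched (names illustrative, clmul() hypothetical) as:
    //
    //   t      = clmul(hi, 0x87);                  // hi * (x^7 + x^2 + x + 1)
    //   result = lo ^ low128(t) ^ clmul(high_bits(t), 0x87);
    //
    // GHASH works in a bit-reflected representation, which is why the stub instead
    // shifts the whole product left by one bit and uses the 31/30/25 and 1/2/7
    // shift patterns above to perform the equivalent folding.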
4599
4600 __ decrement(blocks);
4601 __ jcc(Assembler::zero, L_exit);
4602 __ movdqu(xmm_temp0, xmm_temp6);
4603 __ addptr(data, 16);
4604 __ jmp(L_ghash_loop);
4605
4606 __ BIND(L_exit);
4607 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
4608 __ movdqu(Address(state, 0), xmm_temp6); // store the result
4609 __ leave();
4610 __ ret(0);
4611 return start;
4612 }
4613
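  // The two tables below hold the RFC 4648 "base64" and "base64url" alphabets with
  // each character zero-extended into its own 32-bit little-endian slot (e.g. the
  // first qword 0x0000004200000041 stores 'A' then 'B').  Storing the characters as
  // dwords is what allows the encoder further down to translate 6-bit values with a
  // vpgatherdd lookup, which requires dword indices and dword table elements.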
4614 //base64 character set
4615 address base64_charset_addr() {
4616 __ align(CodeEntryAlignment);
4617 StubCodeMark mark(this, "StubRoutines", "base64_charset");
4618 address start = __ pc();
4619 __ emit_data64(0x0000004200000041, relocInfo::none);
4620 __ emit_data64(0x0000004400000043, relocInfo::none);
4621 __ emit_data64(0x0000004600000045, relocInfo::none);
4622 __ emit_data64(0x0000004800000047, relocInfo::none);
4623 __ emit_data64(0x0000004a00000049, relocInfo::none);
4624 __ emit_data64(0x0000004c0000004b, relocInfo::none);
4625 __ emit_data64(0x0000004e0000004d, relocInfo::none);
4626 __ emit_data64(0x000000500000004f, relocInfo::none);
4627 __ emit_data64(0x0000005200000051, relocInfo::none);
4628 __ emit_data64(0x0000005400000053, relocInfo::none);
4629 __ emit_data64(0x0000005600000055, relocInfo::none);
4630 __ emit_data64(0x0000005800000057, relocInfo::none);
4631 __ emit_data64(0x0000005a00000059, relocInfo::none);
4632 __ emit_data64(0x0000006200000061, relocInfo::none);
4633 __ emit_data64(0x0000006400000063, relocInfo::none);
4634 __ emit_data64(0x0000006600000065, relocInfo::none);
4635 __ emit_data64(0x0000006800000067, relocInfo::none);
4636 __ emit_data64(0x0000006a00000069, relocInfo::none);
4637 __ emit_data64(0x0000006c0000006b, relocInfo::none);
4638 __ emit_data64(0x0000006e0000006d, relocInfo::none);
4639 __ emit_data64(0x000000700000006f, relocInfo::none);
4640 __ emit_data64(0x0000007200000071, relocInfo::none);
4641 __ emit_data64(0x0000007400000073, relocInfo::none);
4642 __ emit_data64(0x0000007600000075, relocInfo::none);
4643 __ emit_data64(0x0000007800000077, relocInfo::none);
4644 __ emit_data64(0x0000007a00000079, relocInfo::none);
4645 __ emit_data64(0x0000003100000030, relocInfo::none);
4646 __ emit_data64(0x0000003300000032, relocInfo::none);
4647 __ emit_data64(0x0000003500000034, relocInfo::none);
4648 __ emit_data64(0x0000003700000036, relocInfo::none);
4649 __ emit_data64(0x0000003900000038, relocInfo::none);
4650 __ emit_data64(0x0000002f0000002b, relocInfo::none);
4651 return start;
4652 }
4653
4654 //base64 url character set
4655 address base64url_charset_addr() {
4656 __ align(CodeEntryAlignment);
4657 StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4658 address start = __ pc();
4659 __ emit_data64(0x0000004200000041, relocInfo::none);
4660 __ emit_data64(0x0000004400000043, relocInfo::none);
4661 __ emit_data64(0x0000004600000045, relocInfo::none);
4662 __ emit_data64(0x0000004800000047, relocInfo::none);
4663 __ emit_data64(0x0000004a00000049, relocInfo::none);
4664 __ emit_data64(0x0000004c0000004b, relocInfo::none);
4665 __ emit_data64(0x0000004e0000004d, relocInfo::none);
4666 __ emit_data64(0x000000500000004f, relocInfo::none);
4667 __ emit_data64(0x0000005200000051, relocInfo::none);
4668 __ emit_data64(0x0000005400000053, relocInfo::none);
4669 __ emit_data64(0x0000005600000055, relocInfo::none);
4670 __ emit_data64(0x0000005800000057, relocInfo::none);
4671 __ emit_data64(0x0000005a00000059, relocInfo::none);
4672 __ emit_data64(0x0000006200000061, relocInfo::none);
4673 __ emit_data64(0x0000006400000063, relocInfo::none);
4674 __ emit_data64(0x0000006600000065, relocInfo::none);
4675 __ emit_data64(0x0000006800000067, relocInfo::none);
4676 __ emit_data64(0x0000006a00000069, relocInfo::none);
4677 __ emit_data64(0x0000006c0000006b, relocInfo::none);
4678 __ emit_data64(0x0000006e0000006d, relocInfo::none);
4679 __ emit_data64(0x000000700000006f, relocInfo::none);
4680 __ emit_data64(0x0000007200000071, relocInfo::none);
4681 __ emit_data64(0x0000007400000073, relocInfo::none);
4682 __ emit_data64(0x0000007600000075, relocInfo::none);
4683 __ emit_data64(0x0000007800000077, relocInfo::none);
4684 __ emit_data64(0x0000007a00000079, relocInfo::none);
4685 __ emit_data64(0x0000003100000030, relocInfo::none);
4686 __ emit_data64(0x0000003300000032, relocInfo::none);
4687 __ emit_data64(0x0000003500000034, relocInfo::none);
4688 __ emit_data64(0x0000003700000036, relocInfo::none);
4689 __ emit_data64(0x0000003900000038, relocInfo::none);
4690 __ emit_data64(0x0000005f0000002d, relocInfo::none);
4691
4692 return start;
4693 }
4694
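  // Shuffle-control table used when regrouping input bytes for encoding.  Control
  // bytes with the top bit set (0x80) follow the PSHUFB/VPSHUFB convention of
  // writing a zero to that destination byte, which is how a zero pad byte gets
  // inserted after every 3-byte source group.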
4695 address base64_bswap_mask_addr() {
4696 __ align(CodeEntryAlignment);
4697 StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
4698 address start = __ pc();
4699 __ emit_data64(0x0504038002010080, relocInfo::none);
4700 __ emit_data64(0x0b0a098008070680, relocInfo::none);
4701 __ emit_data64(0x0908078006050480, relocInfo::none);
4702 __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
4703 __ emit_data64(0x0605048003020180, relocInfo::none);
4704 __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
4705 __ emit_data64(0x0504038002010080, relocInfo::none);
4706 __ emit_data64(0x0b0a098008070680, relocInfo::none);
4707
4708 return start;
4709 }
4710
4711 address base64_right_shift_mask_addr() {
4712 __ align(CodeEntryAlignment);
4713 StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
4714 address start = __ pc();
4715 __ emit_data64(0x0006000400020000, relocInfo::none);
4716 __ emit_data64(0x0006000400020000, relocInfo::none);
4717 __ emit_data64(0x0006000400020000, relocInfo::none);
4718 __ emit_data64(0x0006000400020000, relocInfo::none);
4719 __ emit_data64(0x0006000400020000, relocInfo::none);
4720 __ emit_data64(0x0006000400020000, relocInfo::none);
4721 __ emit_data64(0x0006000400020000, relocInfo::none);
4722 __ emit_data64(0x0006000400020000, relocInfo::none);
4723
4724 return start;
4725 }
4726
4727 address base64_left_shift_mask_addr() {
4728 __ align(CodeEntryAlignment);
4729 StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
4730 address start = __ pc();
4731 __ emit_data64(0x0000000200040000, relocInfo::none);
4732 __ emit_data64(0x0000000200040000, relocInfo::none);
4733 __ emit_data64(0x0000000200040000, relocInfo::none);
4734 __ emit_data64(0x0000000200040000, relocInfo::none);
4735 __ emit_data64(0x0000000200040000, relocInfo::none);
4736 __ emit_data64(0x0000000200040000, relocInfo::none);
4737 __ emit_data64(0x0000000200040000, relocInfo::none);
4738 __ emit_data64(0x0000000200040000, relocInfo::none);
4739
4740 return start;
4741 }
4742
4743 address base64_and_mask_addr() {
4744 __ align(CodeEntryAlignment);
4745 StubCodeMark mark(this, "StubRoutines", "and_mask");
4746 address start = __ pc();
4747 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4748 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4749 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4750 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4751 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4752 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4753 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4754 __ emit_data64(0x3f003f003f000000, relocInfo::none);
4755 return start;
4756 }
4757
4758 address base64_gather_mask_addr() {
4759 __ align(CodeEntryAlignment);
4760 StubCodeMark mark(this, "StubRoutines", "gather_mask");
4761 address start = __ pc();
4762 __ emit_data64(0xffffffffffffffff, relocInfo::none);
4763 return start;
4764 }
4765
4766// Code for generating Base64 encoding.
4767// Intrinsic function prototype in Base64.java:
4768// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4769 address generate_base64_encodeBlock() {
4770 __ align(CodeEntryAlignment);
4771 StubCodeMark mark(this, "StubRoutines", "implEncode");
4772 address start = __ pc();
4773 __ enter();
4774
4775 // Save callee-saved registers before using them
4776 __ push(r12);
4777 __ push(r13);
4778 __ push(r14);
4779 __ push(r15);
4780
4781 // arguments
4782 const Register source = c_rarg0; // Source Array
4783 const Register start_offset = c_rarg1; // start offset
4784 const Register end_offset = c_rarg2; // end offset
4785 const Register dest = c_rarg3; // destination array
4786
4787#ifndef _WIN64
4788 const Register dp = c_rarg4; // Position for writing to dest array
4789 const Register isURL = c_rarg5;// Base64 or URL character set
4790#else
4791     const Address dp_mem(rbp, 6 * wordSize);  // dp and isURL are passed on the stack on Win64
4792     const Address isURL_mem(rbp, 7 * wordSize);
4793     const Register isURL = r10;      // pick a volatile register available on Windows
4794 const Register dp = r12;
4795 __ movl(dp, dp_mem);
4796 __ movl(isURL, isURL_mem);
4797#endif
4798
4799 const Register length = r14;
4800 Label L_process80, L_process32, L_process3, L_exit, L_processdata;
4801
4802 // calculate length from offsets
4803 __ movl(length, end_offset);
4804 __ subl(length, start_offset);
4805 __ cmpl(length, 0);
4806 __ jcc(Assembler::lessEqual, L_exit);
4807
4808 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
4809     // check whether the base64 charset (isURL = 0) or the base64url charset (isURL = 1) should be loaded
4810 __ cmpl(isURL, 0);
4811 __ jcc(Assembler::equal, L_processdata);
4812 __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
4813
4814 // load masks required for encoding data
4815 __ BIND(L_processdata);
4816 __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
4817     // Set all 64 bits of the k3 mask register (enables every lane for the gathers below).
4818 __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
4819 __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
4820 __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
4821 __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
4822 __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
4823
4824 // Vector Base64 implementation, producing 96 bytes of encoded data
4825 __ BIND(L_process80);
4826 __ cmpl(length, 80);
4827 __ jcc(Assembler::below, L_process32);
4828 __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
4829 __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
4830 __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
4831
4832     // permute the input data so that each 128-bit lane holds a contiguous run of source bytes
4833 __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
4834 __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
4835 __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
4836
4837     // shuffle the input, grouping 3 source bytes and inserting 0 as the 4th byte;
4838     // this lets us handle 12 bytes at a time in each 128-bit lane
4839 __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
4840 __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
4841 __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
4842
4843     // zero-extend bytes to words; each 128-bit lane now holds 6 source bytes for processing
4844 __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
4845 __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
4846 __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
4847
4848     // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values
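    // In scalar terms (an illustrative sketch only, not the stub's code), the
    // per-lane shifts below compute the classic split of three input bytes
    // b0, b1, b2 into four 6-bit values:
    //
    //   c0 = b0 >> 2;
    //   c1 = ((b0 & 0x03) << 4) | (b1 >> 4);
    //   c2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
    //   c3 = b2 & 0x3f;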
4849 __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit);
4850 __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
4851 __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
4852
4853 __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
4854 __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
4855 __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
4856
4857 __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
4858 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
4859 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
4860
4861 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4862 __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
4863 __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
4864
4865 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
4866 __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
4867 __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
4868
4869 // Get the final 4*6 bits base64 encoding
4870 __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
4871 __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
4872 __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
4873
4874 // Shift
4875 __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4876 __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
4877 __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
4878
4879 // look up 6 bits in the base64 character set to fetch the encoding
4880     // convert words to dwords, since the gather instruction needs dword indices for the table lookup
4881 __ vextracti64x4(xmm6, xmm3, 0);
4882 __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
4883 __ vextracti64x4(xmm6, xmm3, 1);
4884 __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
4885
4886 __ vextracti64x4(xmm6, xmm4, 0);
4887 __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
4888 __ vextracti64x4(xmm6, xmm4, 1);
4889 __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
4890
4891 __ vextracti64x4(xmm4, xmm5, 0);
4892 __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
4893
4894 __ vextracti64x4(xmm4, xmm5, 1);
4895 __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
4896
4897 __ kmovql(k2, k3);
4898 __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
4899 __ kmovql(k2, k3);
4900 __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
4901 __ kmovql(k2, k3);
4902 __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
4903 __ kmovql(k2, k3);
4904 __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
4905 __ kmovql(k2, k3);
4906 __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
4907 __ kmovql(k2, k3);
4908 __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
4909
4910 //Down convert dword to byte. Final output is 16*6 = 96 bytes long
4911 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
4912 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
4913 __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
4914 __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
4915 __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
4916 __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
4917
4918 __ addq(dest, 96);
4919 __ addq(source, 72);
4920 __ subq(length, 72);
4921 __ jmp(L_process80);
4922
4923 // Vector Base64 implementation generating 32 bytes of encoded data
4924 __ BIND(L_process32);
4925 __ cmpl(length, 32);
4926 __ jcc(Assembler::below, L_process3);
4927 __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
4928 __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
4929 __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
4930 __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
4931 __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
4932 __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
4933
4934 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
4935 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4936 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
4937 __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
4938 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
4939 __ vextracti64x4(xmm9, xmm1, 0);
4940 __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
4941 __ vextracti64x4(xmm9, xmm1, 1);
4942 __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit);
4943 __ kmovql(k2, k3);
4944 __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
4945 __ kmovql(k2, k3);
4946 __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
4947 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
4948 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
4949 __ subq(length, 24);
4950 __ addq(dest, 32);
4951 __ addq(source, 24);
4952 __ jmp(L_process32);
4953
4954 // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
4955 /* This code corresponds to the scalar version of the following snippet in Base64.java
4956     ** int bits = (src[sp0++] & 0xff) << 16 | (src[sp0++] & 0xff) << 8 | (src[sp0++] & 0xff);
4957     ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
4958     ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
4959     ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
4960 ** dst[dp0++] = (byte)base64[bits & 0x3f];*/
4961 __ BIND(L_process3);
4962 __ cmpl(length, 3);
4963 __ jcc(Assembler::below, L_exit);
4964 // Read 1 byte at a time
4965 __ movzbl(rax, Address(source, start_offset));
4966 __ shll(rax, 0x10);
4967 __ movl(r15, rax);
4968 __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
4969 __ shll(rax, 0x8);
4970 __ movzwl(rax, rax);
4971 __ orl(r15, rax);
4972 __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
4973 __ orl(rax, r15);
4974 // Save 3 bytes read in r15
4975 __ movl(r15, rax);
4976 __ shrl(rax, 0x12);
4977 __ andl(rax, 0x3f);
4978 // rax contains the index, r11 contains base64 lookup table
4979 __ movb(rax, Address(r11, rax, Address::times_4));
4980 // Write the encoded byte to destination
4981 __ movb(Address(dest, dp, Address::times_1, 0), rax);
4982 __ movl(rax, r15);
4983 __ shrl(rax, 0xc);
4984 __ andl(rax, 0x3f);
4985 __ movb(rax, Address(r11, rax, Address::times_4));
4986 __ movb(Address(dest, dp, Address::times_1, 1), rax);
4987 __ movl(rax, r15);
4988 __ shrl(rax, 0x6);
4989 __ andl(rax, 0x3f);
4990 __ movb(rax, Address(r11, rax, Address::times_4));
4991 __ movb(Address(dest, dp, Address::times_1, 2), rax);
4992 __ movl(rax, r15);
4993 __ andl(rax, 0x3f);
4994 __ movb(rax, Address(r11, rax, Address::times_4));
4995 __ movb(Address(dest, dp, Address::times_1, 3), rax);
4996 __ subl(length, 3);
4997 __ addq(dest, 4);
4998 __ addq(source, 3);
4999 __ jmp(L_process3);
5000 __ BIND(L_exit);
5001 __ pop(r15);
5002 __ pop(r14);
5003 __ pop(r13);
5004 __ pop(r12);
5005 __ leave();
5006 __ ret(0);
5007 return start;
5008 }
5009
5010 /**
5011 * Arguments:
5012 *
5013 * Inputs:
5014 * c_rarg0 - int crc
5015 * c_rarg1 - byte* buf
5016 * c_rarg2 - int length
5017 *
5018   * Output:
5019 * rax - int crc result
5020 */
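  // For comparison only: the stub accelerates the standard reflected CRC-32
  // (polynomial 0xEDB88320) used by java.util.zip.CRC32.  A minimal bitwise
  // reference sketch, ignoring the exact inversion/calling convention handled
  // by kernel_crc32() below (illustrative, not the stub's algorithm):
  //
  //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
  //     crc = ~crc;
  //     while (len--) {
  //       crc ^= *buf++;
  //       for (int k = 0; k < 8; k++)
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  //     }
  //     return ~crc;
  //   }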
5021 address generate_updateBytesCRC32() {
5022 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5023
5024 __ align(CodeEntryAlignment);
5025 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5026
5027 address start = __ pc();
5028 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5029 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5030 // rscratch1: r10
5031 const Register crc = c_rarg0; // crc
5032 const Register buf = c_rarg1; // source java byte array address
5033 const Register len = c_rarg2; // length
5034 const Register table = c_rarg3; // crc_table address (reuse register)
5035 const Register tmp = r11;
5036 assert_different_registers(crc, buf, len, table, tmp, rax);
5037
5038 BLOCK_COMMENT("Entry:");
5039 __ enter(); // required for proper stackwalking of RuntimeStub frame
5040
5041 __ kernel_crc32(crc, buf, len, table, tmp);
5042
5043 __ movl(rax, crc);
5044 __ vzeroupper();
5045 __ leave(); // required for proper stackwalking of RuntimeStub frame
5046 __ ret(0);
5047
5048 return start;
5049 }
5050
5051 /**
5052 * Arguments:
5053 *
5054 * Inputs:
5055 * c_rarg0 - int crc
5056 * c_rarg1 - byte* buf
5057 * c_rarg2 - long length
5058 * c_rarg3 - table_start - optional (present only when doing a library_call,
5059 * not used by x86 algorithm)
5060 *
5061   * Output:
5062 * rax - int crc result
5063 */
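  // Note: CRC32C differs from CRC32 above only in the polynomial: it uses the
  // Castagnoli polynomial 0x1EDC6F41 (0x82F63B78 in reflected form), so the same
  // bitwise reference sketch applies with that constant substituted.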
5064 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5065 assert(UseCRC32CIntrinsics, "need SSE4_2");
5066 __ align(CodeEntryAlignment);
5067 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5068 address start = __ pc();
5069 //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs
5070 //Windows RCX RDX R8 R9 none none XMM0..XMM3
5071 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7
5072 const Register crc = c_rarg0; // crc
5073 const Register buf = c_rarg1; // source java byte array address
5074 const Register len = c_rarg2; // length
5075 const Register a = rax;
5076 const Register j = r9;
5077 const Register k = r10;
5078 const Register l = r11;
5079#ifdef _WIN64
5080 const Register y = rdi;
5081 const Register z = rsi;
5082#else
5083 const Register y = rcx;
5084 const Register z = r8;
5085#endif
5086 assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5087
5088 BLOCK_COMMENT("Entry:");
5089 __ enter(); // required for proper stackwalking of RuntimeStub frame
5090#ifdef _WIN64
5091 __ push(y);
5092 __ push(z);
5093#endif
5094 __ crc32c_ipl_alg2_alt2(crc, buf, len,
5095 a, j, k,
5096 l, y, z,
5097 c_farg0, c_farg1, c_farg2,
5098 is_pclmulqdq_supported);
5099 __ movl(rax, crc);
5100#ifdef _WIN64
5101 __ pop(z);
5102 __ pop(y);
5103#endif
5104 __ vzeroupper();
5105 __ leave(); // required for proper stackwalking of RuntimeStub frame
5106 __ ret(0);
5107
5108 return start;
5109 }
5110
5111 /**
5112 * Arguments:
5113 *
5114 * Input:
5115 * c_rarg0 - x address
5116 * c_rarg1 - x length
5117 * c_rarg2 - y address
5118 * c_rarg3 - y length
5119 * not Win64
5120 * c_rarg4 - z address
5121 * c_rarg5 - z length
5122 * Win64
5123 * rsp+40 - z address
5124 * rsp+48 - z length
5125 */
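  // For orientation (a scalar sketch, not the stub's actual algorithm):
  // multiply_to_len computes the BigInteger-style product z = x * y over 32-bit
  // limbs stored most-significant first.  Assuming z has xlen + ylen limbs and is
  // zero-initialized, the result is equivalent to:
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
  //       z[k]  = (uint32_t)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }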
5126 address generate_multiplyToLen() {
5127 __ align(CodeEntryAlignment);
5128 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5129
5130 address start = __ pc();
5131 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5132 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5133 const Register x = rdi;
5134 const Register xlen = rax;
5135 const Register y = rsi;
5136 const Register ylen = rcx;
5137 const Register z = r8;
5138 const Register zlen = r11;
5139
5140    // The following registers will be saved on the stack in multiply_to_len().
5141 const Register tmp1 = r12;
5142 const Register tmp2 = r13;
5143 const Register tmp3 = r14;
5144 const Register tmp4 = r15;
5145 const Register tmp5 = rbx;
5146
5147 BLOCK_COMMENT("Entry:");
5148 __ enter(); // required for proper stackwalking of RuntimeStub frame
5149
5150#ifndef _WIN64
5151 __ movptr(zlen, r9); // Save r9 in r11 - zlen
5152#endif
5153 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5154 // ylen => rcx, z => r8, zlen => r11
5155 // r9 and r10 may be used to save non-volatile registers
5156#ifdef _WIN64
5157 // last 2 arguments (#4, #5) are on stack on Win64
5158 __ movptr(z, Address(rsp, 6 * wordSize));
5159 __ movptr(zlen, Address(rsp, 7 * wordSize));
5160#endif
5161
5162 __ movptr(xlen, rsi);
5163 __ movptr(y, rdx);
5164 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5165
5166 restore_arg_regs();
5167
5168 __ leave(); // required for proper stackwalking of RuntimeStub frame
5169 __ ret(0);
5170
5171 return start;
5172 }
5173
5174 /**
5175 * Arguments:
5176 *
5177 * Input:
5178 * c_rarg0 - obja address
5179 * c_rarg1 - objb address
5180   *   c_rarg2   - length    length
5181   *   c_rarg3   - scale     log2_array_indxscale
5182 *
5183 * Output:
5184   *   rax   - int   >= 0: index of first mismatch; < 0: bitwise complement of unexamined tail length
5185 */
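  // The semantics mirror jdk.internal.util.ArraysSupport.vectorizedMismatch: a
  // non-negative return value is the element index of the first mismatch found,
  // while a negative value is the bitwise complement of the number of tail elements
  // the stub did not examine (the Java caller finishes those with scalar compares).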
5186 address generate_vectorizedMismatch() {
5187 __ align(CodeEntryAlignment);
5188 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5189 address start = __ pc();
5190
5191 BLOCK_COMMENT("Entry:");
5192 __ enter();
5193
5194#ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5195 const Register scale = c_rarg0; //rcx, will exchange with r9
5196 const Register objb = c_rarg1; //rdx
5197 const Register length = c_rarg2; //r8
5198 const Register obja = c_rarg3; //r9
5199     __ xchgq(obja, scale);  //now obja and scale contain the correct values
5200
5201 const Register tmp1 = r10;
5202 const Register tmp2 = r11;
5203#endif
5204#ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5205 const Register obja = c_rarg0; //U:rdi
5206 const Register objb = c_rarg1; //U:rsi
5207 const Register length = c_rarg2; //U:rdx
5208 const Register scale = c_rarg3; //U:rcx
5209 const Register tmp1 = r8;
5210 const Register tmp2 = r9;
5211#endif
5212 const Register result = rax; //return value
5213 const XMMRegister vec0 = xmm0;
5214 const XMMRegister vec1 = xmm1;
5215 const XMMRegister vec2 = xmm2;
5216
5217 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5218
5219 __ vzeroupper();
5220 __ leave();
5221 __ ret(0);
5222
5223 return start;
5224 }
5225
5226/**
5227 * Arguments:
5228 *
5229  * Input:
5230  *   c_rarg0   - x address
5231  *   c_rarg1   - x length
5232  *   c_rarg2   - z address
5233  *   c_rarg3   - z length
5234 *
5235 */
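  // For orientation: square_to_len computes z = x * x over 32-bit limbs (z has
  // 2 * len limbs), conceptually the same schoolbook loop sketched before
  // generate_multiplyToLen() with y == x; the implementation exploits the symmetry
  // of squaring so each cross product only needs to be computed once.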
5236 address generate_squareToLen() {
5237
5238 __ align(CodeEntryAlignment);
5239 StubCodeMark mark(this, "StubRoutines", "squareToLen");
5240
5241 address start = __ pc();
5242 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5243 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5244 const Register x = rdi;
5245 const Register len = rsi;
5246 const Register z = r8;
5247 const Register zlen = rcx;
5248
5249 const Register tmp1 = r12;
5250 const Register tmp2 = r13;
5251 const Register tmp3 = r14;
5252 const Register tmp4 = r15;
5253 const Register tmp5 = rbx;
5254
5255 BLOCK_COMMENT("Entry:");
5256 __ enter(); // required for proper stackwalking of RuntimeStub frame
5257
5258 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
5259 // zlen => rcx
5260 // r9 and r10 may be used to save non-volatile registers
5261 __ movptr(r8, rdx);
5262 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5263
5264 restore_arg_regs();
5265
5266 __ leave(); // required for proper stackwalking of RuntimeStub frame
5267 __ ret(0);
5268
5269 return start;
5270 }
5271
5272 address generate_method_entry_barrier() {
5273 __ align(CodeEntryAlignment);
5274 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5275
5276 Label deoptimize_label;
5277
5278 address start = __ pc();
5279
5280 __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
5281
5282 BLOCK_COMMENT("Entry:");
5283 __ enter(); // save rbp
5284
5285 // save c_rarg0, because we want to use that value.
5286 // We could do without it but then we depend on the number of slots used by pusha
5287 __ push(c_rarg0);
5288
5289 __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
5290
5291 __ pusha();
5292
5293 // The method may have floats as arguments, and we must spill them before calling
5294 // the VM runtime.
5295 assert(Argument::n_float_register_parameters_j == 8, "Assumption");
5296 const int xmm_size = wordSize * 2;
5297 const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
5298 __ subptr(rsp, xmm_spill_size);
5299 __ movdqu(Address(rsp, xmm_size * 7), xmm7);
5300 __ movdqu(Address(rsp, xmm_size * 6), xmm6);
5301 __ movdqu(Address(rsp, xmm_size * 5), xmm5);
5302 __ movdqu(Address(rsp, xmm_size * 4), xmm4);
5303 __ movdqu(Address(rsp, xmm_size * 3), xmm3);
5304 __ movdqu(Address(rsp, xmm_size * 2), xmm2);
5305 __ movdqu(Address(rsp, xmm_size * 1), xmm1);
5306 __ movdqu(Address(rsp, xmm_size * 0), xmm0);
5307
5308 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
5309
5310 __ movdqu(xmm0, Address(rsp, xmm_size * 0));
5311 __ movdqu(xmm1, Address(rsp, xmm_size * 1));
5312 __ movdqu(xmm2, Address(rsp, xmm_size * 2));
5313 __ movdqu(xmm3, Address(rsp, xmm_size * 3));
5314 __ movdqu(xmm4, Address(rsp, xmm_size * 4));
5315 __ movdqu(xmm5, Address(rsp, xmm_size * 5));
5316 __ movdqu(xmm6, Address(rsp, xmm_size * 6));
5317 __ movdqu(xmm7, Address(rsp, xmm_size * 7));
5318 __ addptr(rsp, xmm_spill_size);
5319
5320 __ cmpl(rax, 1); // 1 means deoptimize
5321 __ jcc(Assembler::equal, deoptimize_label);
5322
5323 __ popa();
5324 __ pop(c_rarg0);
5325
5326 __ leave();
5327
5328 __ addptr(rsp, 1 * wordSize); // cookie
5329 __ ret(0);
5330
5331
5332 __ BIND(deoptimize_label);
5333
5334 __ popa();
5335 __ pop(c_rarg0);
5336
5337 __ leave();
5338
5339     // This check could be removed, but it is useful for verification: getting a SIGSEGV
5340     // here while the stack is still correct is valuable for debugging
5341 __ testptr(rsp, Address(rsp, 0));
5342
5343 __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
5344 __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
5345
5346 return start;
5347 }
5348
5349 /**
5350 * Arguments:
5351 *
5352 * Input:
5353 * c_rarg0 - out address
5354 * c_rarg1 - in address
5355 * c_rarg2 - offset
5356 * c_rarg3 - len
5357 * not Win64
5358 * c_rarg4 - k
5359 * Win64
5360 * rsp+40 - k
5361 */
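  // For orientation (an illustrative sketch; the exact offset arithmetic follows
  // BigInteger.mulAdd's convention and is elided here as 'pos'): mul_add multiplies
  // the 32-bit limbs of 'in' by the scalar 'k' and accumulates into 'out', with the
  // final carry as the int result (returned in rax by the stub's calling convention):
  //
  //   uint64_t carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)in[j] * k + out[pos] + carry;
  //     out[pos--] = (uint32_t)p;
  //     carry = p >> 32;
  //   }
  //   return (uint32_t)carry;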
5362 address generate_mulAdd() {
5363 __ align(CodeEntryAlignment);
5364 StubCodeMark mark(this, "StubRoutines", "mulAdd");
5365
5366 address start = __ pc();
5367 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5368 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5369 const Register out = rdi;
5370 const Register in = rsi;
5371 const Register offset = r11;
5372 const Register len = rcx;
5373 const Register k = r8;
5374
5375    // The following registers will be saved on the stack in mul_add().
5376 const Register tmp1 = r12;
5377 const Register tmp2 = r13;
5378 const Register tmp3 = r14;
5379 const Register tmp4 = r15;
5380 const Register tmp5 = rbx;
5381
5382 BLOCK_COMMENT("Entry:");
5383 __ enter(); // required for proper stackwalking of RuntimeStub frame
5384
5385 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5386 // len => rcx, k => r8
5387 // r9 and r10 may be used to save non-volatile registers
5388#ifdef _WIN64
5389 // last argument is on stack on Win64
5390 __ movl(k, Address(rsp, 6 * wordSize));
5391#endif
5392 __ movptr(r11, rdx); // move offset in rdx to offset(r11)
5393 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5394
5395 restore_arg_regs();
5396
5397 __ leave(); // required for proper stackwalking of RuntimeStub frame
5398 __ ret(0);
5399
5400 return start;
5401 }
5402
5403 address generate_libmExp() {
5404 StubCodeMark mark(this, "StubRoutines", "libmExp");
5405
5406 address start = __ pc();
5407
5408 const XMMRegister x0 = xmm0;
5409 const XMMRegister x1 = xmm1;
5410 const XMMRegister x2 = xmm2;
5411 const XMMRegister x3 = xmm3;
5412
5413 const XMMRegister x4 = xmm4;
5414 const XMMRegister x5 = xmm5;
5415 const XMMRegister x6 = xmm6;
5416 const XMMRegister x7 = xmm7;
5417
5418 const Register tmp = r11;
5419
5420 BLOCK_COMMENT("Entry:");
5421 __ enter(); // required for proper stackwalking of RuntimeStub frame
5422
5423 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5424
5425 __ leave(); // required for proper stackwalking of RuntimeStub frame
5426 __ ret(0);
5427
5428 return start;
5429
5430 }
5431
5432 address generate_libmLog() {
5433 StubCodeMark mark(this, "StubRoutines", "libmLog");
5434
5435 address start = __ pc();
5436
5437 const XMMRegister x0 = xmm0;
5438 const XMMRegister x1 = xmm1;
5439 const XMMRegister x2 = xmm2;
5440 const XMMRegister x3 = xmm3;
5441
5442 const XMMRegister x4 = xmm4;
5443 const XMMRegister x5 = xmm5;
5444 const XMMRegister x6 = xmm6;
5445 const XMMRegister x7 = xmm7;
5446
5447 const Register tmp1 = r11;
5448 const Register tmp2 = r8;
5449
5450 BLOCK_COMMENT("Entry:");
5451 __ enter(); // required for proper stackwalking of RuntimeStub frame
5452
5453 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5454
5455 __ leave(); // required for proper stackwalking of RuntimeStub frame
5456 __ ret(0);
5457
5458 return start;
5459
5460 }
5461
5462 address generate_libmLog10() {
5463 StubCodeMark mark(this, "StubRoutines", "libmLog10");
5464
5465 address start = __ pc();
5466
5467 const XMMRegister x0 = xmm0;
5468 const XMMRegister x1 = xmm1;
5469 const XMMRegister x2 = xmm2;
5470 const XMMRegister x3 = xmm3;
5471
5472 const XMMRegister x4 = xmm4;
5473 const XMMRegister x5 = xmm5;
5474 const XMMRegister x6 = xmm6;
5475 const XMMRegister x7 = xmm7;
5476
5477 const Register tmp = r11;
5478
5479 BLOCK_COMMENT("Entry:");
5480 __ enter(); // required for proper stackwalking of RuntimeStub frame
5481
5482 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5483
5484 __ leave(); // required for proper stackwalking of RuntimeStub frame
5485 __ ret(0);
5486
5487 return start;
5488
5489 }
5490
5491 address generate_libmPow() {
5492 StubCodeMark mark(this, "StubRoutines", "libmPow");
5493
5494 address start = __ pc();
5495
5496 const XMMRegister x0 = xmm0;
5497 const XMMRegister x1 = xmm1;
5498 const XMMRegister x2 = xmm2;
5499 const XMMRegister x3 = xmm3;
5500
5501 const XMMRegister x4 = xmm4;
5502 const XMMRegister x5 = xmm5;
5503 const XMMRegister x6 = xmm6;
5504 const XMMRegister x7 = xmm7;
5505
5506 const Register tmp1 = r8;
5507 const Register tmp2 = r9;
5508 const Register tmp3 = r10;
5509 const Register tmp4 = r11;
5510
5511 BLOCK_COMMENT("Entry:");
5512 __ enter(); // required for proper stackwalking of RuntimeStub frame
5513
5514 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5515
5516 __ leave(); // required for proper stackwalking of RuntimeStub frame
5517 __ ret(0);
5518
5519 return start;
5520
5521 }
5522
5523 address generate_libmSin() {
5524 StubCodeMark mark(this, "StubRoutines", "libmSin");
5525
5526 address start = __ pc();
5527
5528 const XMMRegister x0 = xmm0;
5529 const XMMRegister x1 = xmm1;
5530 const XMMRegister x2 = xmm2;
5531 const XMMRegister x3 = xmm3;
5532
5533 const XMMRegister x4 = xmm4;
5534 const XMMRegister x5 = xmm5;
5535 const XMMRegister x6 = xmm6;
5536 const XMMRegister x7 = xmm7;
5537
5538 const Register tmp1 = r8;
5539 const Register tmp2 = r9;
5540 const Register tmp3 = r10;
5541 const Register tmp4 = r11;
5542
5543 BLOCK_COMMENT("Entry:");
5544 __ enter(); // required for proper stackwalking of RuntimeStub frame
5545
5546#ifdef _WIN64
5547 __ push(rsi);
5548 __ push(rdi);
5549#endif
5550 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5551
5552#ifdef _WIN64
5553 __ pop(rdi);
5554 __ pop(rsi);
5555#endif
5556
5557 __ leave(); // required for proper stackwalking of RuntimeStub frame
5558 __ ret(0);
5559
5560 return start;
5561
5562 }
5563
5564 address generate_libmCos() {
5565 StubCodeMark mark(this, "StubRoutines", "libmCos");
5566
5567 address start = __ pc();
5568
5569 const XMMRegister x0 = xmm0;
5570 const XMMRegister x1 = xmm1;
5571 const XMMRegister x2 = xmm2;
5572 const XMMRegister x3 = xmm3;
5573
5574 const XMMRegister x4 = xmm4;
5575 const XMMRegister x5 = xmm5;
5576 const XMMRegister x6 = xmm6;
5577 const XMMRegister x7 = xmm7;
5578
5579 const Register tmp1 = r8;
5580 const Register tmp2 = r9;
5581 const Register tmp3 = r10;
5582 const Register tmp4 = r11;
5583
5584 BLOCK_COMMENT("Entry:");
5585 __ enter(); // required for proper stackwalking of RuntimeStub frame
5586
5587#ifdef _WIN64
5588 __ push(rsi);
5589 __ push(rdi);
5590#endif
5591 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5592
5593#ifdef _WIN64
5594 __ pop(rdi);
5595 __ pop(rsi);
5596#endif
5597
5598 __ leave(); // required for proper stackwalking of RuntimeStub frame
5599 __ ret(0);
5600
5601 return start;
5602
5603 }
5604
5605 address generate_libmTan() {
5606 StubCodeMark mark(this, "StubRoutines", "libmTan");
5607
5608 address start = __ pc();
5609
5610 const XMMRegister x0 = xmm0;
5611 const XMMRegister x1 = xmm1;
5612 const XMMRegister x2 = xmm2;
5613 const XMMRegister x3 = xmm3;
5614
5615 const XMMRegister x4 = xmm4;
5616 const XMMRegister x5 = xmm5;
5617 const XMMRegister x6 = xmm6;
5618 const XMMRegister x7 = xmm7;
5619
5620 const Register tmp1 = r8;
5621 const Register tmp2 = r9;
5622 const Register tmp3 = r10;
5623 const Register tmp4 = r11;
5624
5625 BLOCK_COMMENT("Entry:");
5626 __ enter(); // required for proper stackwalking of RuntimeStub frame
5627
5628#ifdef _WIN64
5629 __ push(rsi);
5630 __ push(rdi);
5631#endif
5632 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5633
5634#ifdef _WIN64
5635 __ pop(rdi);
5636 __ pop(rsi);
5637#endif
5638
5639 __ leave(); // required for proper stackwalking of RuntimeStub frame
5640 __ ret(0);
5641
5642 return start;
5643
5644 }
5645
5646#undef __
5647#define __ masm->
5648
5649 // Continuation point for throwing of implicit exceptions that are
5650 // not handled in the current activation. Fabricates an exception
5651 // oop and initiates normal exception dispatching in this
5652 // frame. Since we need to preserve callee-saved values (currently
5653 // only for C2, but done for C1 as well) we need a callee-saved oop
5654 // map and therefore have to make these stubs into RuntimeStubs
5655 // rather than BufferBlobs. If the compiler needs all registers to
5656 // be preserved between the fault point and the exception handler
5657 // then it must assume responsibility for that in
5658 // AbstractCompiler::continuation_for_implicit_null_exception or
5659 // continuation_for_implicit_division_by_zero_exception. All other
5660 // implicit exceptions (e.g., NullPointerException or
5661 // AbstractMethodError on entry) are either at call sites or
5662 // otherwise assume that stack unwinding will be initiated, so
5663 // caller saved registers were assumed volatile in the compiler.
5664 address generate_throw_exception(const char* name,
5665 address runtime_entry,
5666 Register arg1 = noreg,
5667 Register arg2 = noreg) {
5668 // Information about frame layout at time of blocking runtime call.
5669 // Note that we only have to preserve callee-saved registers since
5670 // the compilers are responsible for supplying a continuation point
5671 // if they expect all registers to be preserved.
5672 enum layout {
5673 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5674 rbp_off2,
5675 return_off,
5676 return_off2,
5677 framesize // inclusive of return address
5678 };
5679
5680 int insts_size = 512;
5681 int locs_size = 64;
5682
5683 CodeBuffer code(name, insts_size, locs_size);
5684 OopMapSet* oop_maps = new OopMapSet();
5685 MacroAssembler* masm = new MacroAssembler(&code);
5686
5687 address start = __ pc();
5688
5689 // This is an inlined and slightly modified version of call_VM
5690 // which has the ability to fetch the return PC out of
5691 // thread-local storage and also sets up last_Java_sp slightly
5692 // differently than the real call_VM
5693
5694 __ enter(); // required for proper stackwalking of RuntimeStub frame
5695
5696 assert(is_even(framesize/2), "sp not 16-byte aligned");
5697
5698 // return address and rbp are already in place
5699 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5700
5701 int frame_complete = __ pc() - start;
5702
5703 // Set up last_Java_sp and last_Java_fp
5704 address the_pc = __ pc();
5705 __ set_last_Java_frame(rsp, rbp, the_pc);
5706 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
5707
5708 // Call runtime
5709 if (arg1 != noreg) {
5710 assert(arg2 != c_rarg1, "clobbered");
5711 __ movptr(c_rarg1, arg1);
5712 }
5713 if (arg2 != noreg) {
5714 __ movptr(c_rarg2, arg2);
5715 }
5716 __ movptr(c_rarg0, r15_thread);
5717 BLOCK_COMMENT("call runtime_entry");
5718 __ call(RuntimeAddress(runtime_entry));
5719
5720 // Generate oop map
5721 OopMap* map = new OopMap(framesize, 0);
5722
5723 oop_maps->add_gc_map(the_pc - start, map);
5724
5725 __ reset_last_Java_frame(true);
5726
5727 __ leave(); // required for proper stackwalking of RuntimeStub frame
5728
5729 // check for pending exceptions
5730#ifdef ASSERT
5731 Label L;
5732 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5733 (int32_t) NULL_WORD);
5734 __ jcc(Assembler::notEqual, L);
5735 __ should_not_reach_here();
5736 __ bind(L);
5737#endif // ASSERT
5738 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5739
5740
5741 // codeBlob framesize is in words (not VMRegImpl::slot_size)
5742 RuntimeStub* stub =
5743 RuntimeStub::new_runtime_stub(name,
5744 &code,
5745 frame_complete,
5746 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5747 oop_maps, false);
5748 return stub->entry_point();
5749 }
5750
5751 void create_control_words() {
5752 // Round to nearest, 53-bit mode, exceptions masked
5753 StubRoutines::_fpu_cntrl_wrd_std = 0x027F;
5754     // Round to zero, 53-bit mode, exceptions masked
5755 StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5756 // Round to nearest, 24-bit mode, exceptions masked
5757 StubRoutines::_fpu_cntrl_wrd_24 = 0x007F;
5758     // MXCSR: round to nearest, all exceptions masked
5759 StubRoutines::_mxcsr_std = 0x1F80;
5760     // Note: the following two constants are 80-bit values whose
5761     //       layout is critical for correct loading by the FPU.
5762 // Bias for strict fp multiply/divide
5763 StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5764 StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5765 StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5766 // Un-Bias for strict fp multiply/divide
5767 StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5768 StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5769 StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5770 }
5771
5772 // Initialization
5773 void generate_initial() {
5774 // Generates all stubs and initializes the entry points
5775
5776     // These platform-specific settings are needed by generate_call_stub()
5777 create_control_words();
5778
5779     // entry points that exist in all platforms. Note: This is code
5780 // that could be shared among different platforms - however the
5781 // benefit seems to be smaller than the disadvantage of having a
5782 // much more complicated generator structure. See also comment in
5783 // stubRoutines.hpp.
5784
5785 StubRoutines::_forward_exception_entry = generate_forward_exception();
5786
5787 StubRoutines::_call_stub_entry =
5788 generate_call_stub(StubRoutines::_call_stub_return_address);
5789
5790 // is referenced by megamorphic call
5791 StubRoutines::_catch_exception_entry = generate_catch_exception();
5792
5793 // atomic calls
5794 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
5795 StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long();
5796 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
5797 StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte();
5798 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
5799 StubRoutines::_atomic_add_entry = generate_atomic_add();
5800 StubRoutines::_atomic_add_long_entry = generate_atomic_add_long();
5801 StubRoutines::_fence_entry = generate_orderaccess_fence();
5802
5803 // platform dependent
5804 StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5805 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5806
5807 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
5808
5809 // Build this early so it's available for the interpreter.
5810 StubRoutines::_throw_StackOverflowError_entry =
5811 generate_throw_exception("StackOverflowError throw_exception",
5812 CAST_FROM_FN_PTR(address,
5813 SharedRuntime::
5814 throw_StackOverflowError));
5815 StubRoutines::_throw_delayed_StackOverflowError_entry =
5816 generate_throw_exception("delayed StackOverflowError throw_exception",
5817 CAST_FROM_FN_PTR(address,
5818 SharedRuntime::
5819 throw_delayed_StackOverflowError));
5820 if (UseCRC32Intrinsics) {
5821       // set the table address before generating the stubs that use it
5822 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5823 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5824 }
5825
5826 if (UseCRC32CIntrinsics) {
5827 bool supports_clmul = VM_Version::supports_clmul();
5828 StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5829 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5830 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5831 }
5832 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5833 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5834 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5835 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5836 StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5837 StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5838 StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5839 StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5840 StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5841 StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5842 StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5843 StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5844 StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5845 StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5846 StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5847 StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5848 StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5849 StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5850 }
5851 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5852 StubRoutines::_dexp = generate_libmExp();
5853 }
5854 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5855 StubRoutines::_dlog = generate_libmLog();
5856 }
5857 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5858 StubRoutines::_dlog10 = generate_libmLog10();
5859 }
5860 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5861 StubRoutines::_dpow = generate_libmPow();
5862 }
5863 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5864 StubRoutines::_dsin = generate_libmSin();
5865 }
5866 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5867 StubRoutines::_dcos = generate_libmCos();
5868 }
5869 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5870 StubRoutines::_dtan = generate_libmTan();
5871 }
5872 }
5873 }
5874
5875 void generate_all() {
5876 // Generates all stubs and initializes the entry points
5877
5878 // These entry points require SharedInfo::stack0 to be set up in
5879 // non-core builds and need to be relocatable, so they each
5880 // fabricate a RuntimeStub internally.
5881 StubRoutines::_throw_AbstractMethodError_entry =
5882 generate_throw_exception("AbstractMethodError throw_exception",
5883 CAST_FROM_FN_PTR(address,
5884 SharedRuntime::
5885 throw_AbstractMethodError));
5886
5887 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5888 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5889 CAST_FROM_FN_PTR(address,
5890 SharedRuntime::
5891 throw_IncompatibleClassChangeError));
5892
5893 StubRoutines::_throw_NullPointerException_at_call_entry =
5894 generate_throw_exception("NullPointerException at call throw_exception",
5895 CAST_FROM_FN_PTR(address,
5896 SharedRuntime::
5897 throw_NullPointerException_at_call));
5898
5899 // entry points that are platform specific
5900 StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5901 StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5902 StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5903 StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5904
5905 StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
5906 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
5907 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5908 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5909 StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
5910 StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
5911 StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5912 StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
5913 StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
5914 StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
5915 StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
5916
5917 // support for verify_oop (must happen after universe_init)
5918 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5919
5920 // arraycopy stubs used by compilers
5921 generate_arraycopy_stubs();
5922
5923 // don't bother generating these AES intrinsic stubs unless global flag is set
5924 if (UseAESIntrinsics) {
5925 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
5926 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5927 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5928 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5929 if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
5930 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
5931 } else {
5932 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5933 }
5934 }
5935 if (UseAESCTRIntrinsics){
5936 StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5937 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5938 }
5939
5940 if (UseSHA1Intrinsics) {
5941 StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5942 StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5943 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5944 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5945 }
5946 if (UseSHA256Intrinsics) {
5947 StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
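      // Build _k256_W by duplicating each 16-byte group of _k256 into both halves of
      // a 32-byte row (see the loop below), presumably so the AVX2 SHA-256 code can
      // load a group of round constants replicated across both 128-bit lanes with a
      // single 256-bit load.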
5948 char* dst = (char*)StubRoutines::x86::_k256_W;
5949 char* src = (char*)StubRoutines::x86::_k256;
5950 for (int ii = 0; ii < 16; ++ii) {
5951 memcpy(dst + 32 * ii, src + 16 * ii, 16);
5952 memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5953 }
5954 StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5955 StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5956 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5957 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5958 }
5959 if (UseSHA512Intrinsics) {
5960 StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5961 StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5962 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5963 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5964 }
5965
5966 // Generate GHASH intrinsics code
5967 if (UseGHASHIntrinsics) {
5968 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
5969 StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5970 if (VM_Version::supports_avx()) {
5971 StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
5972 StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
5973 StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
5974 } else {
5975 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5976 }
5977 }
5978
5979 if (UseBASE64Intrinsics) {
5980 StubRoutines::x86::_and_mask = base64_and_mask_addr();
5981 StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
5982 StubRoutines::x86::_base64_charset = base64_charset_addr();
5983 StubRoutines::x86::_url_charset = base64url_charset_addr();
5984 StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
5985 StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
5986 StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
5987 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5988 }
5989
5990 // Safefetch stubs.
5991 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
5992 &StubRoutines::_safefetch32_fault_pc,
5993 &StubRoutines::_safefetch32_continuation_pc);
5994 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5995 &StubRoutines::_safefetchN_fault_pc,
5996 &StubRoutines::_safefetchN_continuation_pc);
5997
5998 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5999 if (bs_nm != NULL) {
6000 StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
6001 }
6002#ifdef COMPILER2
6003 if (UseMultiplyToLenIntrinsic) {
6004 StubRoutines::_multiplyToLen = generate_multiplyToLen();
6005 }
6006 if (UseSquareToLenIntrinsic) {
6007 StubRoutines::_squareToLen = generate_squareToLen();
6008 }
6009 if (UseMulAddIntrinsic) {
6010 StubRoutines::_mulAdd = generate_mulAdd();
6011 }
6012#ifndef _WINDOWS
6013 if (UseMontgomeryMultiplyIntrinsic) {
6014 StubRoutines::_montgomeryMultiply
6015 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
6016 }
6017 if (UseMontgomerySquareIntrinsic) {
6018 StubRoutines::_montgomerySquare
6019 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
6020 }
6021#endif // !_WINDOWS
6022#endif // COMPILER2
6023
6024 if (UseVectorizedMismatchIntrinsic) {
6025 StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
6026 }
6027 }
6028
6029 public:
6030 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6031 if (all) {
6032 generate_all();
6033 } else {
6034 generate_initial();
6035 }
6036 }
6037}; // end class declaration
6038
6039void StubGenerator_generate(CodeBuffer* code, bool all) {
6040 StubGenerator g(code, all);
6041}
6042