1 | /* |
2 | * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | * |
23 | */ |
24 | |
25 | #include "precompiled.hpp" |
26 | #include "asm/macroAssembler.hpp" |
27 | #include "asm/macroAssembler.inline.hpp" |
28 | #include "ci/ciUtilities.hpp" |
29 | #include "gc/shared/barrierSet.hpp" |
30 | #include "gc/shared/barrierSetAssembler.hpp" |
31 | #include "gc/shared/barrierSetNMethod.hpp" |
32 | #include "interpreter/interpreter.hpp" |
33 | #include "memory/universe.hpp" |
34 | #include "nativeInst_x86.hpp" |
35 | #include "oops/instanceOop.hpp" |
36 | #include "oops/method.hpp" |
37 | #include "oops/objArrayKlass.hpp" |
38 | #include "oops/oop.inline.hpp" |
39 | #include "prims/methodHandles.hpp" |
40 | #include "runtime/frame.inline.hpp" |
41 | #include "runtime/handles.inline.hpp" |
42 | #include "runtime/sharedRuntime.hpp" |
43 | #include "runtime/stubCodeGenerator.hpp" |
44 | #include "runtime/stubRoutines.hpp" |
45 | #include "runtime/thread.inline.hpp" |
46 | #ifdef COMPILER2 |
47 | #include "opto/runtime.hpp" |
48 | #endif |
49 | #if INCLUDE_ZGC |
50 | #include "gc/z/zThreadLocalData.hpp" |
51 | #endif |
52 | |
53 | // Declaration and definition of StubGenerator (no .hpp file). |
54 | // For a more detailed description of the stub routine structure |
55 | // see the comment in stubRoutines.hpp |
56 | |
57 | #define __ _masm-> |
58 | #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8) |
59 | #define a__ ((Assembler*)_masm)-> |
60 | |
61 | #ifdef PRODUCT |
62 | #define BLOCK_COMMENT(str) /* nothing */ |
63 | #else |
64 | #define BLOCK_COMMENT(str) __ block_comment(str) |
65 | #endif |
66 | |
67 | #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") |
68 | const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions |
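    | // Note: 0xFFC0 keeps MXCSR bits 6..15 (DAZ, the six exception mask bits, |
    | // rounding control and FZ) and clears bits 0..5, the sticky exception |
    | // status flags, so two MXCSR values compare equal whenever their control |
    | // settings match. |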
69 | |
70 | // Stub Code definitions |
71 | |
72 | class StubGenerator: public StubCodeGenerator { |
73 | private: |
74 | |
75 | #ifdef PRODUCT |
76 | #define inc_counter_np(counter) ((void)0) |
77 | #else |
78 | void inc_counter_np_(int& counter) { |
79 | // This can destroy rscratch1 if counter is far from the code cache |
80 | __ incrementl(ExternalAddress((address)&counter)); |
81 | } |
82 | #define inc_counter_np(counter) \ |
83 | BLOCK_COMMENT("inc_counter " #counter); \ |
84 | inc_counter_np_(counter); |
85 | #endif |
86 | |
87 | // Call stubs are used to call Java from C |
88 | // |
89 | // Linux Arguments: |
90 | // c_rarg0: call wrapper address address |
91 | // c_rarg1: result address |
92 | // c_rarg2: result type BasicType |
93 | // c_rarg3: method Method* |
94 | // c_rarg4: (interpreter) entry point address |
95 | // c_rarg5: parameters intptr_t* |
96 | // 16(rbp): parameter size (in words) int |
97 | // 24(rbp): thread Thread* |
98 | // |
99 | // [ return_from_Java ] <--- rsp |
100 | // [ argument word n ] |
101 | // ... |
102 | // -12 [ argument word 1 ] |
103 | // -11 [ saved r15 ] <--- rsp_after_call |
104 | // -10 [ saved r14 ] |
105 | // -9 [ saved r13 ] |
106 | // -8 [ saved r12 ] |
107 | // -7 [ saved rbx ] |
108 | // -6 [ call wrapper ] |
109 | // -5 [ result ] |
110 | // -4 [ result type ] |
111 | // -3 [ method ] |
112 | // -2 [ entry point ] |
113 | // -1 [ parameters ] |
114 | // 0 [ saved rbp ] <--- rbp |
115 | // 1 [ return address ] |
116 | // 2 [ parameter size ] |
117 | // 3 [ thread ] |
118 | // |
119 | // Windows Arguments: |
120 | // c_rarg0: call wrapper address address |
121 | // c_rarg1: result address |
122 | // c_rarg2: result type BasicType |
123 | // c_rarg3: method Method* |
124 | // 48(rbp): (interpreter) entry point address |
125 | // 56(rbp): parameters intptr_t* |
126 | // 64(rbp): parameter size (in words) int |
127 | // 72(rbp): thread Thread* |
128 | // |
129 | // [ return_from_Java ] <--- rsp |
130 | // [ argument word n ] |
131 | // ... |
132 | // -60 [ argument word 1 ] |
133 | // -59 [ saved xmm31 ] <--- rsp_after_call |
134 | // [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank) |
135 | // -27 [ saved xmm15 ] |
136 | // [ saved xmm7-xmm14 ] |
137 | // -9 [ saved xmm6 ] (each xmm register takes 2 slots) |
138 | // -7 [ saved r15 ] |
139 | // -6 [ saved r14 ] |
140 | // -5 [ saved r13 ] |
141 | // -4 [ saved r12 ] |
142 | // -3 [ saved rdi ] |
143 | // -2 [ saved rsi ] |
144 | // -1 [ saved rbx ] |
145 | // 0 [ saved rbp ] <--- rbp |
146 | // 1 [ return address ] |
147 | // 2 [ call wrapper ] |
148 | // 3 [ result ] |
149 | // 4 [ result type ] |
150 | // 5 [ method ] |
151 | // 6 [ entry point ] |
152 | // 7 [ parameters ] |
153 | // 8 [ parameter size ] |
154 | // 9 [ thread ] |
155 | // |
156 | // Windows reserves the caller's stack space for arguments 1-4. |
157 | // We spill c_rarg0-c_rarg3 to this space. |
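    | // |
    | // For orientation, the generated code is entered through the CallStub |
    | // function pointer declared in stubRoutines.hpp. The C++ call site |
    | // (JavaCalls::call_helper) looks roughly like the sketch below; this is |
    | // a sketch only, the authoritative declaration is the CallStub typedef: |
    | // |
    | //   StubRoutines::call_stub()( |
    | //       (address)&link,       // call wrapper       (c_rarg0) |
    | //       result_ptr,           // result address     (c_rarg1) |
    | //       result_type,          // BasicType          (c_rarg2) |
    | //       method,               // Method*            (c_rarg3) |
    | //       entry_point,          // interpreter entry  (c_rarg4 / 48(rbp)) |
    | //       parameters,           // intptr_t*          (c_rarg5 / 56(rbp)) |
    | //       size_of_parameters,   // in words           (16(rbp) / 64(rbp)) |
    | //       thread);              // Thread*            (24(rbp) / 72(rbp)) |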
158 | |
159 | // Call stub stack layout word offsets from rbp |
160 | enum call_stub_layout { |
161 | #ifdef _WIN64 |
162 | xmm_save_first = 6, // save from xmm6 |
163 | xmm_save_last = 31, // to xmm31 |
164 | xmm_save_base = -9, |
165 | rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59 |
166 | r15_off = -7, |
167 | r14_off = -6, |
168 | r13_off = -5, |
169 | r12_off = -4, |
170 | rdi_off = -3, |
171 | rsi_off = -2, |
172 | rbx_off = -1, |
173 | rbp_off = 0, |
174 | retaddr_off = 1, |
175 | call_wrapper_off = 2, |
176 | result_off = 3, |
177 | result_type_off = 4, |
178 | method_off = 5, |
179 | entry_point_off = 6, |
180 | parameters_off = 7, |
181 | parameter_size_off = 8, |
182 | thread_off = 9 |
183 | #else |
184 | rsp_after_call_off = -12, |
185 | mxcsr_off = rsp_after_call_off, |
186 | r15_off = -11, |
187 | r14_off = -10, |
188 | r13_off = -9, |
189 | r12_off = -8, |
190 | rbx_off = -7, |
191 | call_wrapper_off = -6, |
192 | result_off = -5, |
193 | result_type_off = -4, |
194 | method_off = -3, |
195 | entry_point_off = -2, |
196 | parameters_off = -1, |
197 | rbp_off = 0, |
198 | retaddr_off = 1, |
199 | parameter_size_off = 2, |
200 | thread_off = 3 |
201 | #endif |
202 | }; |
203 | |
204 | #ifdef _WIN64 |
205 | Address xmm_save(int reg) { |
206 | assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range" ); |
207 | return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize); |
208 | } |
209 | #endif |
210 | |
211 | address generate_call_stub(address& return_address) { |
212 | assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 && |
213 | (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, |
214 | "adjust this code" ); |
215 | StubCodeMark mark(this, "StubRoutines" , "call_stub" ); |
216 | address start = __ pc(); |
217 | |
218 | // same as in generate_catch_exception()! |
219 | const Address rsp_after_call(rbp, rsp_after_call_off * wordSize); |
220 | |
221 | const Address call_wrapper (rbp, call_wrapper_off * wordSize); |
222 | const Address result (rbp, result_off * wordSize); |
223 | const Address result_type (rbp, result_type_off * wordSize); |
224 | const Address method (rbp, method_off * wordSize); |
225 | const Address entry_point (rbp, entry_point_off * wordSize); |
226 | const Address parameters (rbp, parameters_off * wordSize); |
227 | const Address parameter_size(rbp, parameter_size_off * wordSize); |
228 | |
229 | // same as in generate_catch_exception()! |
230 | const Address thread (rbp, thread_off * wordSize); |
231 | |
232 | const Address r15_save(rbp, r15_off * wordSize); |
233 | const Address r14_save(rbp, r14_off * wordSize); |
234 | const Address r13_save(rbp, r13_off * wordSize); |
235 | const Address r12_save(rbp, r12_off * wordSize); |
236 | const Address rbx_save(rbp, rbx_off * wordSize); |
237 | |
238 | // stub code |
239 | __ enter(); |
240 | __ subptr(rsp, -rsp_after_call_off * wordSize); |
241 | |
242 | // save register parameters |
243 | #ifndef _WIN64 |
244 | __ movptr(parameters, c_rarg5); // parameters |
245 | __ movptr(entry_point, c_rarg4); // entry_point |
246 | #endif |
247 | |
248 | __ movptr(method, c_rarg3); // method |
249 | __ movl(result_type, c_rarg2); // result type |
250 | __ movptr(result, c_rarg1); // result |
251 | __ movptr(call_wrapper, c_rarg0); // call wrapper |
252 | |
253 | // save regs belonging to calling function |
254 | __ movptr(rbx_save, rbx); |
255 | __ movptr(r12_save, r12); |
256 | __ movptr(r13_save, r13); |
257 | __ movptr(r14_save, r14); |
258 | __ movptr(r15_save, r15); |
259 | |
260 | #ifdef _WIN64 |
261 | int last_reg = 15; |
262 | if (UseAVX > 2) { |
263 | last_reg = 31; |
264 | } |
265 | if (VM_Version::supports_evex()) { |
266 | for (int i = xmm_save_first; i <= last_reg; i++) { |
267 | __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0); |
268 | } |
269 | } else { |
270 | for (int i = xmm_save_first; i <= last_reg; i++) { |
271 | __ movdqu(xmm_save(i), as_XMMRegister(i)); |
272 | } |
273 | } |
274 | |
275 | const Address rdi_save(rbp, rdi_off * wordSize); |
276 | const Address rsi_save(rbp, rsi_off * wordSize); |
277 | |
278 | __ movptr(rsi_save, rsi); |
279 | __ movptr(rdi_save, rdi); |
280 | #else |
281 | const Address mxcsr_save(rbp, mxcsr_off * wordSize); |
282 | { |
283 | Label skip_ldmx; |
284 | __ stmxcsr(mxcsr_save); |
285 | __ movl(rax, mxcsr_save); |
286 | __ andl(rax, MXCSR_MASK); // Only check control and mask bits |
287 | ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std()); |
288 | __ cmp32(rax, mxcsr_std); |
289 | __ jcc(Assembler::equal, skip_ldmx); |
290 | __ ldmxcsr(mxcsr_std); |
291 | __ bind(skip_ldmx); |
292 | } |
293 | #endif |
294 | |
295 | // Load up thread register |
296 | __ movptr(r15_thread, thread); |
297 | __ reinit_heapbase(); |
298 | |
299 | #ifdef ASSERT |
300 | // make sure we have no pending exceptions |
301 | { |
302 | Label L; |
303 | __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); |
304 | __ jcc(Assembler::equal, L); |
305 | __ stop("StubRoutines::call_stub: entered with pending exception" ); |
306 | __ bind(L); |
307 | } |
308 | #endif |
309 | |
310 | // pass parameters if any |
311 | BLOCK_COMMENT("pass parameters if any" ); |
312 | Label parameters_done; |
313 | __ movl(c_rarg3, parameter_size); |
314 | __ testl(c_rarg3, c_rarg3); |
315 | __ jcc(Assembler::zero, parameters_done); |
316 | |
317 | Label loop; |
318 | __ movptr(c_rarg2, parameters); // parameter pointer |
319 | __ movl(c_rarg1, c_rarg3); // parameter counter is in c_rarg1 |
320 | __ BIND(loop); |
321 | __ movptr(rax, Address(c_rarg2, 0));// get parameter |
322 | __ addptr(c_rarg2, wordSize); // advance to next parameter |
323 | __ decrementl(c_rarg1); // decrement counter |
324 | __ push(rax); // pass parameter |
325 | __ jcc(Assembler::notZero, loop); |
326 | |
327 | // call Java function |
328 | __ BIND(parameters_done); |
329 | __ movptr(rbx, method); // get Method* |
330 | __ movptr(c_rarg1, entry_point); // get entry_point |
331 | __ mov(r13, rsp); // set sender sp |
332 | BLOCK_COMMENT("call Java function" ); |
333 | __ call(c_rarg1); |
334 | |
335 | BLOCK_COMMENT("call_stub_return_address:" ); |
336 | return_address = __ pc(); |
337 | |
338 | // store result depending on type (everything that is not |
339 | // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) |
340 | __ movptr(c_rarg0, result); |
341 | Label is_long, is_float, is_double, exit; |
342 | __ movl(c_rarg1, result_type); |
343 | __ cmpl(c_rarg1, T_OBJECT); |
344 | __ jcc(Assembler::equal, is_long); |
345 | __ cmpl(c_rarg1, T_LONG); |
346 | __ jcc(Assembler::equal, is_long); |
347 | __ cmpl(c_rarg1, T_FLOAT); |
348 | __ jcc(Assembler::equal, is_float); |
349 | __ cmpl(c_rarg1, T_DOUBLE); |
350 | __ jcc(Assembler::equal, is_double); |
351 | |
352 | // handle T_INT case |
353 | __ movl(Address(c_rarg0, 0), rax); |
354 | |
355 | __ BIND(exit); |
356 | |
357 | // pop parameters |
358 | __ lea(rsp, rsp_after_call); |
359 | |
360 | #ifdef ASSERT |
361 | // verify that threads correspond |
362 | { |
363 | Label L1, L2, L3; |
364 | __ cmpptr(r15_thread, thread); |
365 | __ jcc(Assembler::equal, L1); |
366 | __ stop("StubRoutines::call_stub: r15_thread is corrupted" ); |
367 | __ bind(L1); |
368 | __ get_thread(rbx); |
369 | __ cmpptr(r15_thread, thread); |
370 | __ jcc(Assembler::equal, L2); |
371 | __ stop("StubRoutines::call_stub: r15_thread is modified by call" ); |
372 | __ bind(L2); |
373 | __ cmpptr(r15_thread, rbx); |
374 | __ jcc(Assembler::equal, L3); |
375 | __ stop("StubRoutines::call_stub: threads must correspond" ); |
376 | __ bind(L3); |
377 | } |
378 | #endif |
379 | |
380 | // restore regs belonging to calling function |
381 | #ifdef _WIN64 |
382 | // emit the restores for xmm regs |
383 | if (VM_Version::supports_evex()) { |
384 | for (int i = xmm_save_first; i <= last_reg; i++) { |
385 | __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0); |
386 | } |
387 | } else { |
388 | for (int i = xmm_save_first; i <= last_reg; i++) { |
389 | __ movdqu(as_XMMRegister(i), xmm_save(i)); |
390 | } |
391 | } |
392 | #endif |
393 | __ movptr(r15, r15_save); |
394 | __ movptr(r14, r14_save); |
395 | __ movptr(r13, r13_save); |
396 | __ movptr(r12, r12_save); |
397 | __ movptr(rbx, rbx_save); |
398 | |
399 | #ifdef _WIN64 |
400 | __ movptr(rdi, rdi_save); |
401 | __ movptr(rsi, rsi_save); |
402 | #else |
403 | __ ldmxcsr(mxcsr_save); |
404 | #endif |
405 | |
406 | // restore rsp |
407 | __ addptr(rsp, -rsp_after_call_off * wordSize); |
408 | |
409 | // return |
410 | __ vzeroupper(); |
411 | __ pop(rbp); |
412 | __ ret(0); |
413 | |
414 | // handle return types different from T_INT |
415 | __ BIND(is_long); |
416 | __ movq(Address(c_rarg0, 0), rax); |
417 | __ jmp(exit); |
418 | |
419 | __ BIND(is_float); |
420 | __ movflt(Address(c_rarg0, 0), xmm0); |
421 | __ jmp(exit); |
422 | |
423 | __ BIND(is_double); |
424 | __ movdbl(Address(c_rarg0, 0), xmm0); |
425 | __ jmp(exit); |
426 | |
427 | return start; |
428 | } |
429 | |
430 | // Return point for a Java call if there's an exception thrown in |
431 | // Java code. The exception is caught and transformed into a |
432 | // pending exception stored in JavaThread that can be tested from |
433 | // within the VM. |
434 | // |
435 | // Note: Usually the parameters are removed by the callee. In case |
436 | // of an exception crossing an activation frame boundary, that is |
437 | // not the case if the callee is compiled code => need to setup the |
438 | // rsp. |
439 | // |
440 | // rax: exception oop |
441 | |
442 | address generate_catch_exception() { |
443 | StubCodeMark mark(this, "StubRoutines" , "catch_exception" ); |
444 | address start = __ pc(); |
445 | |
446 | // same as in generate_call_stub(): |
447 | const Address rsp_after_call(rbp, rsp_after_call_off * wordSize); |
448 | const Address thread (rbp, thread_off * wordSize); |
449 | |
450 | #ifdef ASSERT |
451 | // verify that threads correspond |
452 | { |
453 | Label L1, L2, L3; |
454 | __ cmpptr(r15_thread, thread); |
455 | __ jcc(Assembler::equal, L1); |
456 | __ stop("StubRoutines::catch_exception: r15_thread is corrupted" ); |
457 | __ bind(L1); |
458 | __ get_thread(rbx); |
459 | __ cmpptr(r15_thread, thread); |
460 | __ jcc(Assembler::equal, L2); |
461 | __ stop("StubRoutines::catch_exception: r15_thread is modified by call" ); |
462 | __ bind(L2); |
463 | __ cmpptr(r15_thread, rbx); |
464 | __ jcc(Assembler::equal, L3); |
465 | __ stop("StubRoutines::catch_exception: threads must correspond" ); |
466 | __ bind(L3); |
467 | } |
468 | #endif |
469 | |
470 | // set pending exception |
471 | __ verify_oop(rax); |
472 | |
473 | __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax); |
474 | __ lea(rscratch1, ExternalAddress((address)__FILE__)); |
475 | __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1); |
476 | __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__); |
477 | |
478 | // complete return to VM |
479 | assert(StubRoutines::_call_stub_return_address != NULL, |
480 | "_call_stub_return_address must have been generated before" ); |
481 | __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address)); |
482 | |
483 | return start; |
484 | } |
485 | |
486 | // Continuation point for runtime calls returning with a pending |
487 | // exception. The pending exception check happened in the runtime |
488 | // or native call stub. The pending exception in Thread is |
489 | // converted into a Java-level exception. |
490 | // |
491 | // Contract with Java-level exception handlers: |
492 | // rax: exception |
493 | // rdx: throwing pc |
494 | // |
495 | // NOTE: At entry of this stub, exception-pc must be on stack !! |
496 | |
497 | address generate_forward_exception() { |
498 | StubCodeMark mark(this, "StubRoutines" , "forward exception" ); |
499 | address start = __ pc(); |
500 | |
501 | // Upon entry, the sp points to the return address returning into |
502 | // Java (interpreted or compiled) code; i.e., the return address |
503 | // becomes the throwing pc. |
504 | // |
505 | // Arguments pushed before the runtime call are still on the stack |
506 | // but the exception handler will reset the stack pointer -> |
507 | // ignore them. A potential result in registers can be ignored as |
508 | // well. |
509 | |
510 | #ifdef ASSERT |
511 | // make sure this code is only executed if there is a pending exception |
512 | { |
513 | Label L; |
514 | __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL); |
515 | __ jcc(Assembler::notEqual, L); |
516 | __ stop("StubRoutines::forward exception: no pending exception (1)" ); |
517 | __ bind(L); |
518 | } |
519 | #endif |
520 | |
521 | // compute exception handler into rbx |
522 | __ movptr(c_rarg0, Address(rsp, 0)); |
523 | BLOCK_COMMENT("call exception_handler_for_return_address" ); |
524 | __ call_VM_leaf(CAST_FROM_FN_PTR(address, |
525 | SharedRuntime::exception_handler_for_return_address), |
526 | r15_thread, c_rarg0); |
527 | __ mov(rbx, rax); |
528 | |
529 | // setup rax & rdx, remove return address & clear pending exception |
530 | __ pop(rdx); |
531 | __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); |
532 | __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); |
533 | |
534 | #ifdef ASSERT |
535 | // make sure exception is set |
536 | { |
537 | Label L; |
538 | __ testptr(rax, rax); |
539 | __ jcc(Assembler::notEqual, L); |
540 | __ stop("StubRoutines::forward exception: no pending exception (2)" ); |
541 | __ bind(L); |
542 | } |
543 | #endif |
544 | |
545 | // continue at exception handler (return address removed) |
546 | // rax: exception |
547 | // rbx: exception handler |
548 | // rdx: throwing pc |
549 | __ verify_oop(rax); |
550 | __ jmp(rbx); |
551 | |
552 | return start; |
553 | } |
554 | |
555 | // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest) |
556 | // |
557 | // Arguments : |
558 | // c_rarg0: exchange_value |
559 | //    c_rarg1: dest |
560 | // |
561 | // Result: |
562 | // *dest <- ex, return (orig *dest) |
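    | // |
    | // In C terms the stub behaves like the sketch below (XCHG with a memory |
    | // operand is implicitly locked, so no LOCK prefix is needed): |
    | // |
    | //   jint atomic_xchg(jint exchange_value, volatile jint* dest) { |
    | //     jint old = *dest;          // both lines execute as one xchg |
    | //     *dest = exchange_value; |
    | //     return old; |
    | //   } |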
563 | address generate_atomic_xchg() { |
564 | StubCodeMark mark(this, "StubRoutines" , "atomic_xchg" ); |
565 | address start = __ pc(); |
566 | |
567 | __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow |
568 | __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK |
569 | __ ret(0); |
570 | |
571 | return start; |
572 | } |
573 | |
574 | // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest) |
575 | // |
576 | // Arguments : |
577 | // c_rarg0: exchange_value |
578 | // c_rarg1: dest |
579 | // |
580 | // Result: |
581 | // *dest <- ex, return (orig *dest) |
582 | address generate_atomic_xchg_long() { |
583 | StubCodeMark mark(this, "StubRoutines" , "atomic_xchg_long" ); |
584 | address start = __ pc(); |
585 | |
586 | __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow |
587 | __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK |
588 | __ ret(0); |
589 | |
590 | return start; |
591 | } |
592 | |
593 | // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest, |
594 | // jint compare_value) |
595 | // |
596 | // Arguments : |
597 | // c_rarg0: exchange_value |
598 | // c_rarg1: dest |
599 | // c_rarg2: compare_value |
600 | // |
601 | // Result: |
602 | //    if ( compare_value == *dest ) { |
603 | //       *dest = exchange_value; |
604 | //       return compare_value; |
605 | //    } else |
606 | //       return *dest; |
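    | // |
    | // i.e. a compare-and-swap: LOCK CMPXCHG compares rax (preloaded with |
    | // compare_value) against *dest and, if they are equal, stores |
    | // exchange_value; in either case rax ends up holding the value that was |
    | // in memory, which is exactly what the stub returns. |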
607 | address generate_atomic_cmpxchg() { |
608 | StubCodeMark mark(this, "StubRoutines" , "atomic_cmpxchg" ); |
609 | address start = __ pc(); |
610 | |
611 | __ movl(rax, c_rarg2); |
612 | __ lock(); |
613 | __ cmpxchgl(c_rarg0, Address(c_rarg1, 0)); |
614 | __ ret(0); |
615 | |
616 | return start; |
617 | } |
618 | |
619 | // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest, |
620 | // int8_t compare_value) |
621 | // |
622 | // Arguments : |
623 | // c_rarg0: exchange_value |
624 | // c_rarg1: dest |
625 | // c_rarg2: compare_value |
626 | // |
627 | // Result: |
628 | //    if ( compare_value == *dest ) { |
629 | //       *dest = exchange_value; |
630 | //       return compare_value; |
631 | //    } else |
632 | //       return *dest; |
633 | address generate_atomic_cmpxchg_byte() { |
634 | StubCodeMark mark(this, "StubRoutines" , "atomic_cmpxchg_byte" ); |
635 | address start = __ pc(); |
636 | |
637 | __ movsbq(rax, c_rarg2); |
638 | __ lock(); |
639 | __ cmpxchgb(c_rarg0, Address(c_rarg1, 0)); |
640 | __ ret(0); |
641 | |
642 | return start; |
643 | } |
644 | |
645 | // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value, |
646 | // volatile int64_t* dest, |
647 | // int64_t compare_value) |
648 | // Arguments : |
649 | // c_rarg0: exchange_value |
650 | // c_rarg1: dest |
651 | // c_rarg2: compare_value |
652 | // |
653 | // Result: |
654 | //    if ( compare_value == *dest ) { |
655 | //       *dest = exchange_value; |
656 | //       return compare_value; |
657 | //    } else |
658 | //       return *dest; |
659 | address generate_atomic_cmpxchg_long() { |
660 | StubCodeMark mark(this, "StubRoutines" , "atomic_cmpxchg_long" ); |
661 | address start = __ pc(); |
662 | |
663 | __ movq(rax, c_rarg2); |
664 | __ lock(); |
665 | __ cmpxchgq(c_rarg0, Address(c_rarg1, 0)); |
666 | __ ret(0); |
667 | |
668 | return start; |
669 | } |
670 | |
671 | // Support for jint atomic::add(jint add_value, volatile jint* dest) |
672 | // |
673 | // Arguments : |
674 | // c_rarg0: add_value |
675 | // c_rarg1: dest |
676 | // |
677 | // Result: |
678 | // *dest += add_value |
679 | // return *dest; |
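    | // |
    | // Sketch of the semantics: LOCK XADD leaves the previous value of *dest |
    | // in the source register, so the stub adds add_value back in to return |
    | // the updated value: |
    | // |
    | //   jint atomic_add(jint add_value, volatile jint* dest) { |
    | //     jint old = *dest;            // xaddl: *dest += add_value, |
    | //     *dest = old + add_value;     //        old value -> c_rarg0 |
    | //     return old + add_value;      // rax = updated value |
    | //   } |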
680 | address generate_atomic_add() { |
681 | StubCodeMark mark(this, "StubRoutines" , "atomic_add" ); |
682 | address start = __ pc(); |
683 | |
684 | __ movl(rax, c_rarg0); |
685 | __ lock(); |
686 | __ xaddl(Address(c_rarg1, 0), c_rarg0); |
687 | __ addl(rax, c_rarg0); |
688 | __ ret(0); |
689 | |
690 | return start; |
691 | } |
692 | |
693 | // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest) |
694 | // |
695 | // Arguments : |
696 | // c_rarg0: add_value |
697 | // c_rarg1: dest |
698 | // |
699 | // Result: |
700 | // *dest += add_value |
701 | // return *dest; |
702 | address generate_atomic_add_long() { |
703 | StubCodeMark mark(this, "StubRoutines" , "atomic_add_long" ); |
704 | address start = __ pc(); |
705 | |
706 | __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow |
707 | __ lock(); |
708 | __ xaddptr(Address(c_rarg1, 0), c_rarg0); |
709 | __ addptr(rax, c_rarg0); |
710 | __ ret(0); |
711 | |
712 | return start; |
713 | } |
714 | |
715 | // Support for void OrderAccess::fence() |
716 | // |
717 | // Arguments : |
718 | // |
719 | // Result: |
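    | // |
    | // On x86 only StoreLoad reordering is observable to other CPUs, so a |
    | // StoreLoad membar acts as a full fence here. MacroAssembler::membar |
    | // typically realizes it with a locked read-modify-write of a stack word |
    | // (or mfence); this is a descriptive note, see macroAssembler_x86 for |
    | // the actual encoding. |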
720 | address generate_orderaccess_fence() { |
721 | StubCodeMark mark(this, "StubRoutines" , "orderaccess_fence" ); |
722 | address start = __ pc(); |
723 | __ membar(Assembler::StoreLoad); |
724 | __ ret(0); |
725 | |
726 | return start; |
727 | } |
728 | |
729 | // Support for intptr_t get_previous_fp() |
730 | // |
731 | // This routine is used to find the previous frame pointer for the |
732 | // caller (current_frame_guess). It is used as part of debugging |
733 | // when ps() is seemingly lost trying to find frames. |
734 | // This code assumes that the caller (current_frame_guess) has a frame. |
735 | address generate_get_previous_fp() { |
736 | StubCodeMark mark(this, "StubRoutines" , "get_previous_fp" ); |
737 | const Address old_fp(rbp, 0); |
738 | const Address older_fp(rax, 0); |
739 | address start = __ pc(); |
740 | |
741 | __ enter(); |
742 | __ movptr(rax, old_fp); // callers fp |
743 | __ movptr(rax, older_fp); // the frame for ps() |
744 | __ pop(rbp); |
745 | __ ret(0); |
746 | |
747 | return start; |
748 | } |
749 | |
750 | // Support for intptr_t get_previous_sp() |
751 | // |
752 | // This routine is used to find the previous stack pointer for the |
753 | // caller. |
754 | address generate_get_previous_sp() { |
755 | StubCodeMark mark(this, "StubRoutines" , "get_previous_sp" ); |
756 | address start = __ pc(); |
757 | |
758 | __ movptr(rax, rsp); |
759 | __ addptr(rax, 8); // return address is at the top of the stack. |
760 | __ ret(0); |
761 | |
762 | return start; |
763 | } |
764 | |
765 | //---------------------------------------------------------------------------------------------------- |
766 | // Support for void verify_mxcsr() |
767 | // |
768 | // This routine is used with -Xcheck:jni to verify that native |
769 | // JNI code does not return to Java code without restoring the |
770 | // MXCSR register to our expected state. |
771 | |
772 | address generate_verify_mxcsr() { |
773 | StubCodeMark mark(this, "StubRoutines" , "verify_mxcsr" ); |
774 | address start = __ pc(); |
775 | |
776 | const Address mxcsr_save(rsp, 0); |
777 | |
778 | if (CheckJNICalls) { |
779 | Label ok_ret; |
780 | ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std()); |
781 | __ push(rax); |
782 | __ subptr(rsp, wordSize); // allocate a temp location |
783 | __ stmxcsr(mxcsr_save); |
784 | __ movl(rax, mxcsr_save); |
785 | __ andl(rax, MXCSR_MASK); // Only check control and mask bits |
786 | __ cmp32(rax, mxcsr_std); |
787 | __ jcc(Assembler::equal, ok_ret); |
788 | |
789 | __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall" ); |
790 | |
791 | __ ldmxcsr(mxcsr_std); |
792 | |
793 | __ bind(ok_ret); |
794 | __ addptr(rsp, wordSize); |
795 | __ pop(rax); |
796 | } |
797 | |
798 | __ ret(0); |
799 | |
800 | return start; |
801 | } |
802 | |
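    | // The *_fixup stubs below repair float/double -> int/long conversions. |
    | // CVTTSS2SI/CVTTSD2SI produce the "integer indefinite" value (min_jint / |
    | // min_jlong) for NaN and out-of-range inputs; compiled code that sees |
    | // that value calls the matching fixup stub, which re-reads the original |
    | // operand from the stack slot ("inout") and overwrites it with the Java |
    | // result: 0 for NaN, min for negative overflow, max for positive |
    | // overflow. (Descriptive summary; the call sites are in the platform |
    | // .ad files.) |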
803 | address generate_f2i_fixup() { |
804 | StubCodeMark mark(this, "StubRoutines" , "f2i_fixup" ); |
805 | Address inout(rsp, 5 * wordSize); // return address + 4 saves |
806 | |
807 | address start = __ pc(); |
808 | |
809 | Label L; |
810 | |
811 | __ push(rax); |
812 | __ push(c_rarg3); |
813 | __ push(c_rarg2); |
814 | __ push(c_rarg1); |
815 | |
816 | __ movl(rax, 0x7f800000); |
817 | __ xorl(c_rarg3, c_rarg3); |
818 | __ movl(c_rarg2, inout); |
819 | __ movl(c_rarg1, c_rarg2); |
820 | __ andl(c_rarg1, 0x7fffffff); |
821 | __ cmpl(rax, c_rarg1); // NaN? -> 0 |
822 | __ jcc(Assembler::negative, L); |
823 | __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint |
824 | __ movl(c_rarg3, 0x80000000); |
825 | __ movl(rax, 0x7fffffff); |
826 | __ cmovl(Assembler::positive, c_rarg3, rax); |
827 | |
828 | __ bind(L); |
829 | __ movptr(inout, c_rarg3); |
830 | |
831 | __ pop(c_rarg1); |
832 | __ pop(c_rarg2); |
833 | __ pop(c_rarg3); |
834 | __ pop(rax); |
835 | |
836 | __ ret(0); |
837 | |
838 | return start; |
839 | } |
840 | |
841 | address generate_f2l_fixup() { |
842 | StubCodeMark mark(this, "StubRoutines" , "f2l_fixup" ); |
843 | Address inout(rsp, 5 * wordSize); // return address + 4 saves |
844 | address start = __ pc(); |
845 | |
846 | Label L; |
847 | |
848 | __ push(rax); |
849 | __ push(c_rarg3); |
850 | __ push(c_rarg2); |
851 | __ push(c_rarg1); |
852 | |
853 | __ movl(rax, 0x7f800000); |
854 | __ xorl(c_rarg3, c_rarg3); |
855 | __ movl(c_rarg2, inout); |
856 | __ movl(c_rarg1, c_rarg2); |
857 | __ andl(c_rarg1, 0x7fffffff); |
858 | __ cmpl(rax, c_rarg1); // NaN? -> 0 |
859 | __ jcc(Assembler::negative, L); |
860 | __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong |
861 | __ mov64(c_rarg3, 0x8000000000000000); |
862 | __ mov64(rax, 0x7fffffffffffffff); |
863 | __ cmov(Assembler::positive, c_rarg3, rax); |
864 | |
865 | __ bind(L); |
866 | __ movptr(inout, c_rarg3); |
867 | |
868 | __ pop(c_rarg1); |
869 | __ pop(c_rarg2); |
870 | __ pop(c_rarg3); |
871 | __ pop(rax); |
872 | |
873 | __ ret(0); |
874 | |
875 | return start; |
876 | } |
877 | |
878 | address generate_d2i_fixup() { |
879 | StubCodeMark mark(this, "StubRoutines" , "d2i_fixup" ); |
880 | Address inout(rsp, 6 * wordSize); // return address + 5 saves |
881 | |
882 | address start = __ pc(); |
883 | |
884 | Label L; |
885 | |
886 | __ push(rax); |
887 | __ push(c_rarg3); |
888 | __ push(c_rarg2); |
889 | __ push(c_rarg1); |
890 | __ push(c_rarg0); |
891 | |
892 | __ movl(rax, 0x7ff00000); |
893 | __ movq(c_rarg2, inout); |
894 | __ movl(c_rarg3, c_rarg2); |
895 | __ mov(c_rarg1, c_rarg2); |
896 | __ mov(c_rarg0, c_rarg2); |
897 | __ negl(c_rarg3); |
898 | __ shrptr(c_rarg1, 0x20); |
899 | __ orl(c_rarg3, c_rarg2); |
900 | __ andl(c_rarg1, 0x7fffffff); |
901 | __ xorl(c_rarg2, c_rarg2); |
902 | __ shrl(c_rarg3, 0x1f); |
903 | __ orl(c_rarg1, c_rarg3); |
904 | __ cmpl(rax, c_rarg1); |
905 | __ jcc(Assembler::negative, L); // NaN -> 0 |
906 | __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint |
907 | __ movl(c_rarg2, 0x80000000); |
908 | __ movl(rax, 0x7fffffff); |
909 | __ cmov(Assembler::positive, c_rarg2, rax); |
910 | |
911 | __ bind(L); |
912 | __ movptr(inout, c_rarg2); |
913 | |
914 | __ pop(c_rarg0); |
915 | __ pop(c_rarg1); |
916 | __ pop(c_rarg2); |
917 | __ pop(c_rarg3); |
918 | __ pop(rax); |
919 | |
920 | __ ret(0); |
921 | |
922 | return start; |
923 | } |
924 | |
925 | address generate_d2l_fixup() { |
926 | StubCodeMark mark(this, "StubRoutines" , "d2l_fixup" ); |
927 | Address inout(rsp, 6 * wordSize); // return address + 5 saves |
928 | |
929 | address start = __ pc(); |
930 | |
931 | Label L; |
932 | |
933 | __ push(rax); |
934 | __ push(c_rarg3); |
935 | __ push(c_rarg2); |
936 | __ push(c_rarg1); |
937 | __ push(c_rarg0); |
938 | |
939 | __ movl(rax, 0x7ff00000); |
940 | __ movq(c_rarg2, inout); |
941 | __ movl(c_rarg3, c_rarg2); |
942 | __ mov(c_rarg1, c_rarg2); |
943 | __ mov(c_rarg0, c_rarg2); |
944 | __ negl(c_rarg3); |
945 | __ shrptr(c_rarg1, 0x20); |
946 | __ orl(c_rarg3, c_rarg2); |
947 | __ andl(c_rarg1, 0x7fffffff); |
948 | __ xorl(c_rarg2, c_rarg2); |
949 | __ shrl(c_rarg3, 0x1f); |
950 | __ orl(c_rarg1, c_rarg3); |
951 | __ cmpl(rax, c_rarg1); |
952 | __ jcc(Assembler::negative, L); // NaN -> 0 |
953 | __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong |
954 | __ mov64(c_rarg2, 0x8000000000000000); |
955 | __ mov64(rax, 0x7fffffffffffffff); |
956 | __ cmovq(Assembler::positive, c_rarg2, rax); |
957 | |
958 | __ bind(L); |
959 | __ movq(inout, c_rarg2); |
960 | |
961 | __ pop(c_rarg0); |
962 | __ pop(c_rarg1); |
963 | __ pop(c_rarg2); |
964 | __ pop(c_rarg3); |
965 | __ pop(rax); |
966 | |
967 | __ ret(0); |
968 | |
969 | return start; |
970 | } |
971 | |
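    | // generate_fp_mask emits a 16-byte constant that compiled code references |
    | // via StubRoutines::x86 (e.g. float_sign_mask / float_sign_flip, which are |
    | // set up later in this file). A usage sketch, not generated here: |
    | // |
    | //   andps xmm0, [float_sign_mask]   // fabs: clear the sign bit |
    | //   xorps xmm0, [float_sign_flip]   // fneg: flip the sign bit |
    | // |
    | // Both 64-bit halves hold the same mask, so the constant also works for |
    | // packed operands. |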
972 | address generate_fp_mask(const char *stub_name, int64_t mask) { |
973 | __ align(CodeEntryAlignment); |
974 | StubCodeMark mark(this, "StubRoutines" , stub_name); |
975 | address start = __ pc(); |
976 | |
977 | __ emit_data64( mask, relocInfo::none ); |
978 | __ emit_data64( mask, relocInfo::none ); |
979 | |
980 | return start; |
981 | } |
982 | |
983 | address generate_vector_mask(const char *stub_name, int64_t mask) { |
984 | __ align(CodeEntryAlignment); |
985 | StubCodeMark mark(this, "StubRoutines" , stub_name); |
986 | address start = __ pc(); |
987 | |
988 | __ emit_data64(mask, relocInfo::none); |
989 | __ emit_data64(mask, relocInfo::none); |
990 | __ emit_data64(mask, relocInfo::none); |
991 | __ emit_data64(mask, relocInfo::none); |
992 | __ emit_data64(mask, relocInfo::none); |
993 | __ emit_data64(mask, relocInfo::none); |
994 | __ emit_data64(mask, relocInfo::none); |
995 | __ emit_data64(mask, relocInfo::none); |
996 | |
997 | return start; |
998 | } |
999 | |
1000 | address generate_vector_byte_perm_mask(const char *stub_name) { |
1001 | __ align(CodeEntryAlignment); |
1002 | StubCodeMark mark(this, "StubRoutines" , stub_name); |
1003 | address start = __ pc(); |
1004 | |
1005 | __ emit_data64(0x0000000000000001, relocInfo::none); |
1006 | __ emit_data64(0x0000000000000003, relocInfo::none); |
1007 | __ emit_data64(0x0000000000000005, relocInfo::none); |
1008 | __ emit_data64(0x0000000000000007, relocInfo::none); |
1009 | __ emit_data64(0x0000000000000000, relocInfo::none); |
1010 | __ emit_data64(0x0000000000000002, relocInfo::none); |
1011 | __ emit_data64(0x0000000000000004, relocInfo::none); |
1012 | __ emit_data64(0x0000000000000006, relocInfo::none); |
1013 | |
1014 | return start; |
1015 | } |
1016 | |
1017 | // Non-destructive plausibility checks for oops |
1018 | // |
1019 | // Arguments: |
1020 | // all args on stack! |
1021 | // |
1022 | // Stack after saving c_rarg3: |
1023 | // [tos + 0]: saved c_rarg3 |
1024 | // [tos + 1]: saved c_rarg2 |
1025 | // [tos + 2]: saved r12 (several TemplateTable methods use it) |
1026 | // [tos + 3]: saved flags |
1027 | // [tos + 4]: return address |
1028 | // * [tos + 5]: error message (char*) |
1029 | // * [tos + 6]: object to verify (oop) |
1030 | // * [tos + 7]: saved rax - saved by caller and bashed |
1031 | // * [tos + 8]: saved r10 (rscratch1) - saved by caller |
1032 | // * = popped on exit |
1033 | address generate_verify_oop() { |
1034 | StubCodeMark mark(this, "StubRoutines" , "verify_oop" ); |
1035 | address start = __ pc(); |
1036 | |
1037 | Label exit, error; |
1038 | |
1039 | __ pushf(); |
1040 | __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); |
1041 | |
1042 | __ push(r12); |
1043 | |
1044 | // save c_rarg2 and c_rarg3 |
1045 | __ push(c_rarg2); |
1046 | __ push(c_rarg3); |
1047 | |
1048 | enum { |
1049 | // After previous pushes. |
1050 | oop_to_verify = 6 * wordSize, |
1051 | saved_rax = 7 * wordSize, |
1052 | saved_r10 = 8 * wordSize, |
1053 | |
1054 | // Before the call to MacroAssembler::debug(), see below. |
1055 | return_addr = 16 * wordSize, |
1056 | error_msg = 17 * wordSize |
1057 | }; |
1058 | |
1059 | // get object |
1060 | __ movptr(rax, Address(rsp, oop_to_verify)); |
1061 | |
1062 | // make sure object is 'reasonable' |
1063 | __ testptr(rax, rax); |
1064 | __ jcc(Assembler::zero, exit); // if obj is NULL it is OK |
1065 | |
1066 | #if INCLUDE_ZGC |
1067 | if (UseZGC) { |
1068 | // Check if metadata bits indicate a bad oop |
1069 | __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset())); |
1070 | __ jcc(Assembler::notZero, error); |
1071 | } |
1072 | #endif |
1073 | |
1074 | // Check if the oop is in the right area of memory |
1075 | __ movptr(c_rarg2, rax); |
1076 | __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask()); |
1077 | __ andptr(c_rarg2, c_rarg3); |
1078 | __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits()); |
1079 | __ cmpptr(c_rarg2, c_rarg3); |
1080 | __ jcc(Assembler::notZero, error); |
1081 | |
1082 | // set r12 to heapbase for load_klass() |
1083 | __ reinit_heapbase(); |
1084 | |
1085 | // make sure klass is 'reasonable', which is not zero. |
1086 | __ load_klass(rax, rax); // get klass |
1087 | __ testptr(rax, rax); |
1088 | __ jcc(Assembler::zero, error); // if klass is NULL it is broken |
1089 | |
1090 | // return if everything seems ok |
1091 | __ bind(exit); |
1092 | __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back |
1093 | __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back |
1094 | __ pop(c_rarg3); // restore c_rarg3 |
1095 | __ pop(c_rarg2); // restore c_rarg2 |
1096 | __ pop(r12); // restore r12 |
1097 | __ popf(); // restore flags |
1098 | __ ret(4 * wordSize); // pop caller saved stuff |
1099 | |
1100 | // handle errors |
1101 | __ bind(error); |
1102 | __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back |
1103 | __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back |
1104 | __ pop(c_rarg3); // get saved c_rarg3 back |
1105 | __ pop(c_rarg2); // get saved c_rarg2 back |
1106 | __ pop(r12); // get saved r12 back |
1107 | __ popf(); // get saved flags off stack -- |
1108 | // will be ignored |
1109 | |
1110 | __ pusha(); // push registers |
1111 | // (rip is already |
1112 | // pushed) |
1113 | // debug(char* msg, int64_t pc, int64_t regs[]) |
1114 | // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and |
1115 | // pushed all the registers, so now the stack looks like: |
1116 | // [tos + 0] 16 saved registers |
1117 | // [tos + 16] return address |
1118 | // * [tos + 17] error message (char*) |
1119 | // * [tos + 18] object to verify (oop) |
1120 | // * [tos + 19] saved rax - saved by caller and bashed |
1121 | // * [tos + 20] saved r10 (rscratch1) - saved by caller |
1122 | // * = popped on exit |
1123 | |
1124 | __ movptr(c_rarg0, Address(rsp, error_msg)); // pass address of error message |
1125 | __ movptr(c_rarg1, Address(rsp, return_addr)); // pass return address |
1126 | __ movq(c_rarg2, rsp); // pass address of regs on stack |
1127 | __ mov(r12, rsp); // remember rsp |
1128 | __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows |
1129 | __ andptr(rsp, -16); // align stack as required by ABI |
1130 | BLOCK_COMMENT("call MacroAssembler::debug" ); |
1131 | __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); |
1132 | __ mov(rsp, r12); // restore rsp |
1133 | __ popa(); // pop registers (includes r12) |
1134 | __ ret(4 * wordSize); // pop caller saved stuff |
1135 | |
1136 | return start; |
1137 | } |
1138 | |
1139 | // |
1140 | // Verify that a register contains a clean 32-bit positive value |
1141 | // (high 32 bits are 0) so it can be used in 64-bit shifts. |
1142 | // |
1143 | // Input: |
1144 | // Rint - 32-bits value |
1145 | // Rtmp - scratch |
1146 | // |
1147 | void assert_clean_int(Register Rint, Register Rtmp) { |
1148 | #ifdef ASSERT |
1149 | Label L; |
1150 | assert_different_registers(Rtmp, Rint); |
1151 | __ movslq(Rtmp, Rint); |
1152 | __ cmpq(Rtmp, Rint); |
1153 | __ jcc(Assembler::equal, L); |
1154 | __ stop("high 32-bits of int value are not 0" ); |
1155 | __ bind(L); |
1156 | #endif |
1157 | } |
1158 | |
1159 | // Generate overlap test for array copy stubs |
1160 | // |
1161 | // Input: |
1162 | // c_rarg0 - from |
1163 | // c_rarg1 - to |
1164 | // c_rarg2 - element count |
1165 | // |
1166 | // Output: |
1167 | // rax - &from[element count - 1] |
1168 | // |
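    | // The test branches to the no-overlap target when a forward copy is safe, |
    | // i.e. roughly (using unsigned pointer comparisons): |
    | // |
    | //   if (to <= from || to >= from + count * elem_size) |
    | //     goto no_overlap;           // disjoint: copy forward |
    | //   // otherwise 'to' lies inside [from, from + count*elem_size) and the |
    | //   // caller falls through to the backward (conjoint) copy code. |
    | // |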
1169 | void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) { |
1170 | assert(no_overlap_target != NULL, "must be generated" ); |
1171 | array_overlap_test(no_overlap_target, NULL, sf); |
1172 | } |
1173 | void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) { |
1174 | array_overlap_test(NULL, &L_no_overlap, sf); |
1175 | } |
1176 | void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) { |
1177 | const Register from = c_rarg0; |
1178 | const Register to = c_rarg1; |
1179 | const Register count = c_rarg2; |
1180 | const Register end_from = rax; |
1181 | |
1182 | __ cmpptr(to, from); |
1183 | __ lea(end_from, Address(from, count, sf, 0)); |
1184 | if (NOLp == NULL) { |
1185 | ExternalAddress no_overlap(no_overlap_target); |
1186 | __ jump_cc(Assembler::belowEqual, no_overlap); |
1187 | __ cmpptr(to, end_from); |
1188 | __ jump_cc(Assembler::aboveEqual, no_overlap); |
1189 | } else { |
1190 | __ jcc(Assembler::belowEqual, (*NOLp)); |
1191 | __ cmpptr(to, end_from); |
1192 | __ jcc(Assembler::aboveEqual, (*NOLp)); |
1193 | } |
1194 | } |
1195 | |
1196 | // Shuffle first three arg regs on Windows into Linux/Solaris locations. |
1197 | // |
1198 | // Outputs: |
1199 | // rdi - rcx |
1200 | // rsi - rdx |
1201 | // rdx - r8 |
1202 | // rcx - r9 |
1203 | // |
1204 | // Registers r9 and r10 are used to save rdi and rsi, which are |
1205 | // non-volatile on Windows. r9 and r10 should not be used by the caller. |
1206 | // |
1207 | DEBUG_ONLY(bool regs_in_thread;) |
1208 | |
1209 | void setup_arg_regs(int nargs = 3) { |
1210 | const Register saved_rdi = r9; |
1211 | const Register saved_rsi = r10; |
1212 | assert(nargs == 3 || nargs == 4, "else fix" ); |
1213 | #ifdef _WIN64 |
1214 | assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, |
1215 | "unexpected argument registers" ); |
1216 | if (nargs >= 4) |
1217 | __ mov(rax, r9); // r9 is also saved_rdi |
1218 | __ movptr(saved_rdi, rdi); |
1219 | __ movptr(saved_rsi, rsi); |
1220 | __ mov(rdi, rcx); // c_rarg0 |
1221 | __ mov(rsi, rdx); // c_rarg1 |
1222 | __ mov(rdx, r8); // c_rarg2 |
1223 | if (nargs >= 4) |
1224 | __ mov(rcx, rax); // c_rarg3 (via rax) |
1225 | #else |
1226 | assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx, |
1227 | "unexpected argument registers" ); |
1228 | #endif |
1229 | DEBUG_ONLY(regs_in_thread = false;) |
1230 | } |
1231 | |
1232 | void restore_arg_regs() { |
1233 | assert(!regs_in_thread, "wrong call to restore_arg_regs" ); |
1234 | const Register saved_rdi = r9; |
1235 | const Register saved_rsi = r10; |
1236 | #ifdef _WIN64 |
1237 | __ movptr(rdi, saved_rdi); |
1238 | __ movptr(rsi, saved_rsi); |
1239 | #endif |
1240 | } |
1241 | |
1242 | // This is used in places where r10 is a scratch register, and can |
1243 | // be adapted if r9 is needed also. |
1244 | void setup_arg_regs_using_thread() { |
1245 | const Register saved_r15 = r9; |
1246 | #ifdef _WIN64 |
1247 | __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored |
1248 | __ get_thread(r15_thread); |
1249 | assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, |
1250 | "unexpected argument registers" ); |
1251 | __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi); |
1252 | __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi); |
1253 | |
1254 | __ mov(rdi, rcx); // c_rarg0 |
1255 | __ mov(rsi, rdx); // c_rarg1 |
1256 | __ mov(rdx, r8); // c_rarg2 |
1257 | #else |
1258 | assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx, |
1259 | "unexpected argument registers" ); |
1260 | #endif |
1261 | DEBUG_ONLY(regs_in_thread = true;) |
1262 | } |
1263 | |
1264 | void restore_arg_regs_using_thread() { |
1265 | assert(regs_in_thread, "wrong call to restore_arg_regs" ); |
1266 | const Register saved_r15 = r9; |
1267 | #ifdef _WIN64 |
1268 | __ get_thread(r15_thread); |
1269 | __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset()))); |
1270 | __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset()))); |
1271 | __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored |
1272 | #endif |
1273 | } |
1274 | |
1275 | // Copy big chunks forward |
1276 | // |
1277 | // Inputs: |
1278 | // end_from - source array's end address |
1279 | // end_to - destination array end address |
1280 | // qword_count - 64-bits element count, negative |
1281 | // to - scratch |
1282 | // L_copy_bytes - entry label |
1283 | // L_copy_8_bytes - exit label |
1284 | // |
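    | // The loop indexes both arrays with a negative qword_count that climbs |
    | // toward zero, so one induction variable serves as both address offset |
    | // and termination test. A scalar sketch of the structure, using a |
    | // hypothetical copy_64_bytes_at helper (the real width of each transfer |
    | // depends on UseAVX / UseUnalignedLoadStores): |
    | // |
    | //   while (qword_count + 8 <= 0) {            // whole 64-byte chunks |
    | //     qword_count += 8; |
    | //     copy_64_bytes_at(end_from, end_to, qword_count); |
    | //   } |
    | //   // a trailing 32-byte chunk is handled here; the remaining single |
    | //   // qwords are copied by the caller's L_copy_8_bytes loop. |
    | // |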
1285 | void copy_bytes_forward(Register end_from, Register end_to, |
1286 | Register qword_count, Register to, |
1287 | Label& L_copy_bytes, Label& L_copy_8_bytes) { |
1288 | DEBUG_ONLY(__ stop("enter at entry label, not here" )); |
1289 | Label L_loop; |
1290 | __ align(OptoLoopAlignment); |
1291 | if (UseUnalignedLoadStores) { |
1292 | Label L_end; |
1293 | // Copy 64-bytes per iteration |
1294 | __ BIND(L_loop); |
1295 | if (UseAVX > 2) { |
1296 | __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); |
1297 | __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); |
1298 | } else if (UseAVX == 2) { |
1299 | __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1300 | __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1301 | __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); |
1302 | __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); |
1303 | } else { |
1304 | __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1305 | __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1306 | __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); |
1307 | __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); |
1308 | __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); |
1309 | __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); |
1310 | __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); |
1311 | __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); |
1312 | } |
1313 | __ BIND(L_copy_bytes); |
1314 | __ addptr(qword_count, 8); |
1315 | __ jcc(Assembler::lessEqual, L_loop); |
1316 | __ subptr(qword_count, 4); // sub(8) and add(4) |
1317 | __ jccb(Assembler::greater, L_end); |
1318 | // Copy trailing 32 bytes |
1319 | if (UseAVX >= 2) { |
1320 | __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); |
1321 | __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); |
1322 | } else { |
1323 | __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); |
1324 | __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); |
1325 | __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); |
1326 | __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); |
1327 | } |
1328 | __ addptr(qword_count, 4); |
1329 | __ BIND(L_end); |
1330 | if (UseAVX >= 2) { |
1331 | // clean upper bits of YMM registers |
1332 | __ vpxor(xmm0, xmm0); |
1333 | __ vpxor(xmm1, xmm1); |
1334 | } |
1335 | } else { |
1336 | // Copy 32-bytes per iteration |
1337 | __ BIND(L_loop); |
1338 | __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); |
1339 | __ movq(Address(end_to, qword_count, Address::times_8, -24), to); |
1340 | __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); |
1341 | __ movq(Address(end_to, qword_count, Address::times_8, -16), to); |
1342 | __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); |
1343 | __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); |
1344 | __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); |
1345 | __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); |
1346 | |
1347 | __ BIND(L_copy_bytes); |
1348 | __ addptr(qword_count, 4); |
1349 | __ jcc(Assembler::lessEqual, L_loop); |
1350 | } |
1351 | __ subptr(qword_count, 4); |
1352 | __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords |
1353 | } |
1354 | |
1355 | // Copy big chunks backward |
1356 | // |
1357 | // Inputs: |
1358 | // from - source array's address |
1359 | // dest - destination array address |
1360 | // qword_count - 64-bits element count |
1361 | // to - scratch |
1362 | // L_copy_bytes - entry label |
1363 | // L_copy_8_bytes - exit label |
1364 | // |
1365 | void copy_bytes_backward(Register from, Register dest, |
1366 | Register qword_count, Register to, |
1367 | Label& L_copy_bytes, Label& L_copy_8_bytes) { |
1368 | DEBUG_ONLY(__ stop("enter at entry label, not here" )); |
1369 | Label L_loop; |
1370 | __ align(OptoLoopAlignment); |
1371 | if (UseUnalignedLoadStores) { |
1372 | Label L_end; |
1373 | // Copy 64-bytes per iteration |
1374 | __ BIND(L_loop); |
1375 | if (UseAVX > 2) { |
1376 | __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); |
1377 | __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); |
1378 | } else if (UseAVX == 2) { |
1379 | __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); |
1380 | __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); |
1381 | __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); |
1382 | __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); |
1383 | } else { |
1384 | __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); |
1385 | __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); |
1386 | __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); |
1387 | __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); |
1388 | __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); |
1389 | __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); |
1390 | __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); |
1391 | __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); |
1392 | } |
1393 | __ BIND(L_copy_bytes); |
1394 | __ subptr(qword_count, 8); |
1395 | __ jcc(Assembler::greaterEqual, L_loop); |
1396 | |
1397 | __ addptr(qword_count, 4); // add(8) and sub(4) |
1398 | __ jccb(Assembler::less, L_end); |
1399 | // Copy trailing 32 bytes |
1400 | if (UseAVX >= 2) { |
1401 | __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); |
1402 | __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); |
1403 | } else { |
1404 | __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); |
1405 | __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); |
1406 | __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); |
1407 | __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); |
1408 | } |
1409 | __ subptr(qword_count, 4); |
1410 | __ BIND(L_end); |
1411 | if (UseAVX >= 2) { |
1412 | // clean upper bits of YMM registers |
1413 | __ vpxor(xmm0, xmm0); |
1414 | __ vpxor(xmm1, xmm1); |
1415 | } |
1416 | } else { |
1417 | // Copy 32-bytes per iteration |
1418 | __ BIND(L_loop); |
1419 | __ movq(to, Address(from, qword_count, Address::times_8, 24)); |
1420 | __ movq(Address(dest, qword_count, Address::times_8, 24), to); |
1421 | __ movq(to, Address(from, qword_count, Address::times_8, 16)); |
1422 | __ movq(Address(dest, qword_count, Address::times_8, 16), to); |
1423 | __ movq(to, Address(from, qword_count, Address::times_8, 8)); |
1424 | __ movq(Address(dest, qword_count, Address::times_8, 8), to); |
1425 | __ movq(to, Address(from, qword_count, Address::times_8, 0)); |
1426 | __ movq(Address(dest, qword_count, Address::times_8, 0), to); |
1427 | |
1428 | __ BIND(L_copy_bytes); |
1429 | __ subptr(qword_count, 4); |
1430 | __ jcc(Assembler::greaterEqual, L_loop); |
1431 | } |
1432 | __ addptr(qword_count, 4); |
1433 | __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords |
1434 | } |
1435 | |
1436 | |
1437 | // Arguments: |
1438 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1439 | // ignored |
1440 | // name - stub name string |
1441 | // |
1442 | // Inputs: |
1443 | // c_rarg0 - source array address |
1444 | // c_rarg1 - destination array address |
1445 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1446 | // |
1447 | // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
1448 | // we let the hardware handle it. The one to eight bytes within words, |
1449 | // dwords or qwords that span cache line boundaries will still be loaded |
1450 | // and stored atomically. |
1451 | // |
1452 | // Side Effects: |
1453 | // disjoint_byte_copy_entry is set to the no-overlap entry point |
1454 | // used by generate_conjoint_byte_copy(). |
1455 | // |
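    | // Overall shape of the copy, as a sketch with hypothetical helpers: the |
    | // byte count is split into full qwords plus a tail of at most seven |
    | // bytes, and the tail is finished with at most one dword, one word and |
    | // one byte move: |
    | // |
    | //   size_t qwords = count >> 3; |
    | //   copy_qwords(from, to, qwords);     // bulk copy, SSE/AVX when enabled |
    | //   if (count & 4) copy_dword();       // trailing 4 bytes |
    | //   if (count & 2) copy_word();        // trailing 2 bytes |
    | //   if (count & 1) copy_byte();        // trailing 1 byte |
    | // |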
1456 | address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { |
1457 | __ align(CodeEntryAlignment); |
1458 | StubCodeMark mark(this, "StubRoutines" , name); |
1459 | address start = __ pc(); |
1460 | |
1461 | Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; |
1462 | Label L_copy_byte, L_exit; |
1463 | const Register from = rdi; // source array address |
1464 | const Register to = rsi; // destination array address |
1465 | const Register count = rdx; // elements count |
1466 | const Register byte_count = rcx; |
1467 | const Register qword_count = count; |
1468 | const Register end_from = from; // source array end address |
1469 | const Register end_to = to; // destination array end address |
1470 | // End pointers are inclusive, and if count is not zero they point |
1471 | // to the last unit copied: end_to[0] := end_from[0] |
1472 | |
1473 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1474 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1475 | |
1476 | if (entry != NULL) { |
1477 | *entry = __ pc(); |
1478 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1479 | BLOCK_COMMENT("Entry:" ); |
1480 | } |
1481 | |
1482 | setup_arg_regs(); // from => rdi, to => rsi, count => rdx |
1483 | // r9 and r10 may be used to save non-volatile registers |
1484 | |
1485 | // 'from', 'to' and 'count' are now valid |
1486 | __ movptr(byte_count, count); |
1487 | __ shrptr(count, 3); // count => qword_count |
1488 | |
1489 | // Copy from low to high addresses. Use 'to' as scratch. |
1490 | __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); |
1491 | __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); |
1492 | __ negptr(qword_count); // make the count negative |
1493 | __ jmp(L_copy_bytes); |
1494 | |
1495 | // Copy trailing qwords |
1496 | __ BIND(L_copy_8_bytes); |
1497 | __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); |
1498 | __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); |
1499 | __ increment(qword_count); |
1500 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
1501 | |
1502 | // Check for and copy trailing dword |
1503 | __ BIND(L_copy_4_bytes); |
1504 | __ testl(byte_count, 4); |
1505 | __ jccb(Assembler::zero, L_copy_2_bytes); |
1506 | __ movl(rax, Address(end_from, 8)); |
1507 | __ movl(Address(end_to, 8), rax); |
1508 | |
1509 | __ addptr(end_from, 4); |
1510 | __ addptr(end_to, 4); |
1511 | |
1512 | // Check for and copy trailing word |
1513 | __ BIND(L_copy_2_bytes); |
1514 | __ testl(byte_count, 2); |
1515 | __ jccb(Assembler::zero, L_copy_byte); |
1516 | __ movw(rax, Address(end_from, 8)); |
1517 | __ movw(Address(end_to, 8), rax); |
1518 | |
1519 | __ addptr(end_from, 2); |
1520 | __ addptr(end_to, 2); |
1521 | |
1522 | // Check for and copy trailing byte |
1523 | __ BIND(L_copy_byte); |
1524 | __ testl(byte_count, 1); |
1525 | __ jccb(Assembler::zero, L_exit); |
1526 | __ movb(rax, Address(end_from, 8)); |
1527 | __ movb(Address(end_to, 8), rax); |
1528 | |
1529 | __ BIND(L_exit); |
1530 | restore_arg_regs(); |
1531 | inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free |
1532 | __ xorptr(rax, rax); // return 0 |
1533 | __ vzeroupper(); |
1534 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1535 | __ ret(0); |
1536 | |
1537 | // Copy in multi-byte chunks |
1538 | copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
1539 | __ jmp(L_copy_4_bytes); |
1540 | |
1541 | return start; |
1542 | } |
1543 | |
1544 | // Arguments: |
1545 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1546 | // ignored |
1547 | // name - stub name string |
1548 | // |
1549 | // Inputs: |
1550 | // c_rarg0 - source array address |
1551 | // c_rarg1 - destination array address |
1552 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1553 | // |
1554 | // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
1555 | // we let the hardware handle it. The one to eight bytes within words, |
1556 | // dwords or qwords that span cache line boundaries will still be loaded |
1557 | // and stored atomically. |
1558 | // |
1559 | address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, |
1560 | address* entry, const char *name) { |
1561 | __ align(CodeEntryAlignment); |
1562 | StubCodeMark mark(this, "StubRoutines" , name); |
1563 | address start = __ pc(); |
1564 | |
1565 | Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; |
1566 | const Register from = rdi; // source array address |
1567 | const Register to = rsi; // destination array address |
1568 | const Register count = rdx; // elements count |
1569 | const Register byte_count = rcx; |
1570 | const Register qword_count = count; |
1571 | |
1572 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1573 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1574 | |
1575 | if (entry != NULL) { |
1576 | *entry = __ pc(); |
1577 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1578 | BLOCK_COMMENT("Entry:" ); |
1579 | } |
1580 | |
1581 | array_overlap_test(nooverlap_target, Address::times_1); |
1582 | setup_arg_regs(); // from => rdi, to => rsi, count => rdx |
1583 | // r9 and r10 may be used to save non-volatile registers |
1584 | |
1585 | // 'from', 'to' and 'count' are now valid |
1586 | __ movptr(byte_count, count); |
1587 | __ shrptr(count, 3); // count => qword_count |
1588 | |
1589 | // Copy from high to low addresses. |
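     | // (Illustration only.)  Since the arrays may overlap with 'to' above 'from', |
     | // the copy must run from the highest address downward: first peel the |
     | // trailing byte (if byte_count & 1), then the trailing word (byte_count & 2), |
     | // then the trailing dword (byte_count & 4), working down from the top of the |
     | // range; the remaining whole qwords are then copied from high to low. |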
1590 | |
1591 | // Check for and copy trailing byte |
1592 | __ testl(byte_count, 1); |
1593 | __ jcc(Assembler::zero, L_copy_2_bytes); |
1594 | __ movb(rax, Address(from, byte_count, Address::times_1, -1)); |
1595 | __ movb(Address(to, byte_count, Address::times_1, -1), rax); |
1596 | __ decrement(byte_count); // Adjust for possible trailing word |
1597 | |
1598 | // Check for and copy trailing word |
1599 | __ BIND(L_copy_2_bytes); |
1600 | __ testl(byte_count, 2); |
1601 | __ jcc(Assembler::zero, L_copy_4_bytes); |
1602 | __ movw(rax, Address(from, byte_count, Address::times_1, -2)); |
1603 | __ movw(Address(to, byte_count, Address::times_1, -2), rax); |
1604 | |
1605 | // Check for and copy trailing dword |
1606 | __ BIND(L_copy_4_bytes); |
1607 | __ testl(byte_count, 4); |
1608 | __ jcc(Assembler::zero, L_copy_bytes); |
1609 | __ movl(rax, Address(from, qword_count, Address::times_8)); |
1610 | __ movl(Address(to, qword_count, Address::times_8), rax); |
1611 | __ jmp(L_copy_bytes); |
1612 | |
1613 | // Copy trailing qwords |
1614 | __ BIND(L_copy_8_bytes); |
1615 | __ movq(rax, Address(from, qword_count, Address::times_8, -8)); |
1616 | __ movq(Address(to, qword_count, Address::times_8, -8), rax); |
1617 | __ decrement(qword_count); |
1618 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
1619 | |
1620 | restore_arg_regs(); |
1621 | inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free |
1622 | __ xorptr(rax, rax); // return 0 |
1623 | __ vzeroupper(); |
1624 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1625 | __ ret(0); |
1626 | |
1627 | // Copy in multi-byte chunks |
1628 | copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
1629 | |
1630 | restore_arg_regs(); |
1631 | inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free |
1632 | __ xorptr(rax, rax); // return 0 |
1633 | __ vzeroupper(); |
1634 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1635 | __ ret(0); |
1636 | |
1637 | return start; |
1638 | } |
1639 | |
1640 | // Arguments: |
1641 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1642 | // ignored |
1643 | // name - stub name string |
1644 | // |
1645 | // Inputs: |
1646 | // c_rarg0 - source array address |
1647 | // c_rarg1 - destination array address |
1648 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1649 | // |
1650 | // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we |
1651 | // let the hardware handle it. The two or four words within dwords |
1652 | // or qwords that span cache line boundaries will still be loaded |
1653 | // and stored atomically. |
1654 | // |
1655 | // Side Effects: |
1656 | // disjoint_short_copy_entry is set to the no-overlap entry point |
1657 | // used by generate_conjoint_short_copy(). |
1658 | // |
1659 | address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { |
1660 | __ align(CodeEntryAlignment); |
1661 | StubCodeMark mark(this, "StubRoutines" , name); |
1662 | address start = __ pc(); |
1663 | |
1664 | Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit; |
1665 | const Register from = rdi; // source array address |
1666 | const Register to = rsi; // destination array address |
1667 | const Register count = rdx; // elements count |
1668 | const Register word_count = rcx; |
1669 | const Register qword_count = count; |
1670 | const Register end_from = from; // source array end address |
1671 | const Register end_to = to; // destination array end address |
1672 | // End pointers are inclusive, and if count is not zero they point |
1673 | // to the last unit copied: end_to[0] := end_from[0] |
1674 | |
1675 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1676 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1677 | |
1678 | if (entry != NULL) { |
1679 | *entry = __ pc(); |
1680 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1681 | BLOCK_COMMENT("Entry:" ); |
1682 | } |
1683 | |
1684 | setup_arg_regs(); // from => rdi, to => rsi, count => rdx |
1685 | // r9 and r10 may be used to save non-volatile registers |
1686 | |
1687 | // 'from', 'to' and 'count' are now valid |
1688 | __ movptr(word_count, count); |
1689 | __ shrptr(count, 2); // count => qword_count |
1690 | |
1691 | // Copy from low to high addresses. Use 'to' as scratch. |
1692 | __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); |
1693 | __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); |
1694 | __ negptr(qword_count); |
1695 | __ jmp(L_copy_bytes); |
1696 | |
1697 | // Copy trailing qwords |
1698 | __ BIND(L_copy_8_bytes); |
1699 | __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); |
1700 | __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); |
1701 | __ increment(qword_count); |
1702 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
1703 | |
1704 | // Original 'dest' is trashed, so we can't use it as a |
1705 | // base register for a possible trailing word copy |
1706 | |
1707 | // Check for and copy trailing dword |
1708 | __ BIND(L_copy_4_bytes); |
1709 | __ testl(word_count, 2); |
1710 | __ jccb(Assembler::zero, L_copy_2_bytes); |
1711 | __ movl(rax, Address(end_from, 8)); |
1712 | __ movl(Address(end_to, 8), rax); |
1713 | |
1714 | __ addptr(end_from, 4); |
1715 | __ addptr(end_to, 4); |
1716 | |
1717 | // Check for and copy trailing word |
1718 | __ BIND(L_copy_2_bytes); |
1719 | __ testl(word_count, 1); |
1720 | __ jccb(Assembler::zero, L_exit); |
1721 | __ movw(rax, Address(end_from, 8)); |
1722 | __ movw(Address(end_to, 8), rax); |
1723 | |
1724 | __ BIND(L_exit); |
1725 | restore_arg_regs(); |
1726 | inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free |
1727 | __ xorptr(rax, rax); // return 0 |
1728 | __ vzeroupper(); |
1729 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1730 | __ ret(0); |
1731 | |
1732 | // Copy in multi-byte chunks |
1733 | copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
1734 | __ jmp(L_copy_4_bytes); |
1735 | |
1736 | return start; |
1737 | } |
1738 | |
1739 | address generate_fill(BasicType t, bool aligned, const char *name) { |
1740 | __ align(CodeEntryAlignment); |
1741 | StubCodeMark mark(this, "StubRoutines" , name); |
1742 | address start = __ pc(); |
1743 | |
1744 | BLOCK_COMMENT("Entry:" ); |
1745 | |
1746 | const Register to = c_rarg0; // destination array address |
1747 | const Register value = c_rarg1; // value |
1748 | const Register count = c_rarg2; // elements count |
1749 | |
1750 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1751 | |
1752 | __ generate_fill(t, aligned, to, value, count, rax, xmm0); |
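     | // MacroAssembler::generate_fill emits the actual fill loop for the raw |
     | // destination address (c_rarg0), fill value (c_rarg1) and element count |
     | // (c_rarg2); rax and xmm0 are handed to it as scratch registers (the xmm |
     | // register presumably so large counts can be filled with wide stores). |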
1753 | |
1754 | __ vzeroupper(); |
1755 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1756 | __ ret(0); |
1757 | return start; |
1758 | } |
1759 | |
1760 | // Arguments: |
1761 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1762 | // ignored |
1763 | // name - stub name string |
1764 | // |
1765 | // Inputs: |
1766 | // c_rarg0 - source array address |
1767 | // c_rarg1 - destination array address |
1768 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1769 | // |
1770 | // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we |
1771 | // let the hardware handle it. The two or four words within dwords |
1772 | // or qwords that span cache line boundaries will still be loaded |
1773 | // and stored atomically. |
1774 | // |
1775 | address generate_conjoint_short_copy(bool aligned, address nooverlap_target, |
1776 | address *entry, const char *name) { |
1777 | __ align(CodeEntryAlignment); |
1778 | StubCodeMark mark(this, "StubRoutines" , name); |
1779 | address start = __ pc(); |
1780 | |
1781 | Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; |
1782 | const Register from = rdi; // source array address |
1783 | const Register to = rsi; // destination array address |
1784 | const Register count = rdx; // elements count |
1785 | const Register word_count = rcx; |
1786 | const Register qword_count = count; |
1787 | |
1788 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1789 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1790 | |
1791 | if (entry != NULL) { |
1792 | *entry = __ pc(); |
1793 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1794 | BLOCK_COMMENT("Entry:" ); |
1795 | } |
1796 | |
1797 | array_overlap_test(nooverlap_target, Address::times_2); |
1798 | setup_arg_regs(); // from => rdi, to => rsi, count => rdx |
1799 | // r9 and r10 may be used to save non-volatile registers |
1800 | |
1801 | // 'from', 'to' and 'count' are now valid |
1802 | __ movptr(word_count, count); |
1803 | __ shrptr(count, 2); // count => qword_count |
1804 | |
1805 | // Copy from high to low addresses. Use 'to' as scratch. |
1806 | |
1807 | // Check for and copy trailing word |
1808 | __ testl(word_count, 1); |
1809 | __ jccb(Assembler::zero, L_copy_4_bytes); |
1810 | __ movw(rax, Address(from, word_count, Address::times_2, -2)); |
1811 | __ movw(Address(to, word_count, Address::times_2, -2), rax); |
1812 | |
1813 | // Check for and copy trailing dword |
1814 | __ BIND(L_copy_4_bytes); |
1815 | __ testl(word_count, 2); |
1816 | __ jcc(Assembler::zero, L_copy_bytes); |
1817 | __ movl(rax, Address(from, qword_count, Address::times_8)); |
1818 | __ movl(Address(to, qword_count, Address::times_8), rax); |
1819 | __ jmp(L_copy_bytes); |
1820 | |
1821 | // Copy trailing qwords |
1822 | __ BIND(L_copy_8_bytes); |
1823 | __ movq(rax, Address(from, qword_count, Address::times_8, -8)); |
1824 | __ movq(Address(to, qword_count, Address::times_8, -8), rax); |
1825 | __ decrement(qword_count); |
1826 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
1827 | |
1828 | restore_arg_regs(); |
1829 | inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free |
1830 | __ xorptr(rax, rax); // return 0 |
1831 | __ vzeroupper(); |
1832 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1833 | __ ret(0); |
1834 | |
1835 | // Copy in multi-byte chunks |
1836 | copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
1837 | |
1838 | restore_arg_regs(); |
1839 | inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free |
1840 | __ xorptr(rax, rax); // return 0 |
1841 | __ vzeroupper(); |
1842 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1843 | __ ret(0); |
1844 | |
1845 | return start; |
1846 | } |
1847 | |
1848 | // Arguments: |
1849 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1850 | // ignored |
1851 | // is_oop - true => oop array, so generate store check code |
1852 | // name - stub name string |
1853 | // |
1854 | // Inputs: |
1855 | // c_rarg0 - source array address |
1856 | // c_rarg1 - destination array address |
1857 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1858 | // |
1859 | // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
1860 | // the hardware handle it. The two dwords within qwords that span |
1861 | // cache line boundaries will still be loaded and stored atomically. |
1862 | // |
1863 | // Side Effects: |
1864 | // disjoint_int_copy_entry is set to the no-overlap entry point |
1865 | // used by generate_conjoint_int_oop_copy(). |
1866 | // |
1867 | address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, |
1868 | const char *name, bool dest_uninitialized = false) { |
1869 | __ align(CodeEntryAlignment); |
1870 | StubCodeMark mark(this, "StubRoutines" , name); |
1871 | address start = __ pc(); |
1872 | |
1873 | Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; |
1874 | const Register from = rdi; // source array address |
1875 | const Register to = rsi; // destination array address |
1876 | const Register count = rdx; // elements count |
1877 | const Register dword_count = rcx; |
1878 | const Register qword_count = count; |
1879 | const Register end_from = from; // source array end address |
1880 | const Register end_to = to; // destination array end address |
1881 | // End pointers are inclusive, and if count is not zero they point |
1882 | // to the last unit copied: end_to[0] := end_from[0] |
1883 | |
1884 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1885 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1886 | |
1887 | if (entry != NULL) { |
1888 | *entry = __ pc(); |
1889 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1890 | BLOCK_COMMENT("Entry:" ); |
1891 | } |
1892 | |
1893 | setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx |
1894 | // r9 is used to save r15_thread |
1895 | |
1896 | DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; |
1897 | if (dest_uninitialized) { |
1898 | decorators |= IS_DEST_UNINITIALIZED; |
1899 | } |
1900 | if (aligned) { |
1901 | decorators |= ARRAYCOPY_ALIGNED; |
1902 | } |
1903 | |
1904 | BasicType type = is_oop ? T_OBJECT : T_INT; |
1905 | BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
1906 | bs->arraycopy_prologue(_masm, decorators, type, from, to, count); |
1907 | |
1908 | // 'from', 'to' and 'count' are now valid |
1909 | __ movptr(dword_count, count); |
1910 | __ shrptr(count, 1); // count => qword_count |
1911 | |
1912 | // Copy from low to high addresses. Use 'to' as scratch. |
1913 | __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); |
1914 | __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); |
1915 | __ negptr(qword_count); |
1916 | __ jmp(L_copy_bytes); |
1917 | |
1918 | // Copy trailing qwords |
1919 | __ BIND(L_copy_8_bytes); |
1920 | __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); |
1921 | __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); |
1922 | __ increment(qword_count); |
1923 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
1924 | |
1925 | // Check for and copy trailing dword |
1926 | __ BIND(L_copy_4_bytes); |
1927 | __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 |
1928 | __ jccb(Assembler::zero, L_exit); |
1929 | __ movl(rax, Address(end_from, 8)); |
1930 | __ movl(Address(end_to, 8), rax); |
1931 | |
1932 | __ BIND(L_exit); |
1933 | bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); |
1934 | restore_arg_regs_using_thread(); |
1935 | inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free |
1936 | __ vzeroupper(); |
1937 | __ xorptr(rax, rax); // return 0 |
1938 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
1939 | __ ret(0); |
1940 | |
1941 | // Copy in multi-byte chunks |
1942 | copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
1943 | __ jmp(L_copy_4_bytes); |
1944 | |
1945 | return start; |
1946 | } |
1947 | |
1948 | // Arguments: |
1949 | // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
1950 | // ignored |
1951 | // is_oop - true => oop array, so generate store check code |
1952 | // name - stub name string |
1953 | // |
1954 | // Inputs: |
1955 | // c_rarg0 - source array address |
1956 | // c_rarg1 - destination array address |
1957 | // c_rarg2 - element count, treated as ssize_t, can be zero |
1958 | // |
1959 | // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
1960 | // the hardware handle it. The two dwords within qwords that span |
1961 | // cache line boundaries will still be loaded and stored atomically. |
1962 | // |
1963 | address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, |
1964 | address *entry, const char *name, |
1965 | bool dest_uninitialized = false) { |
1966 | __ align(CodeEntryAlignment); |
1967 | StubCodeMark mark(this, "StubRoutines" , name); |
1968 | address start = __ pc(); |
1969 | |
1970 | Label L_copy_bytes, L_copy_8_bytes, L_exit; |
1971 | const Register from = rdi; // source array address |
1972 | const Register to = rsi; // destination array address |
1973 | const Register count = rdx; // elements count |
1974 | const Register dword_count = rcx; |
1975 | const Register qword_count = count; |
1976 | |
1977 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
1978 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
1979 | |
1980 | if (entry != NULL) { |
1981 | *entry = __ pc(); |
1982 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
1983 | BLOCK_COMMENT("Entry:" ); |
1984 | } |
1985 | |
1986 | array_overlap_test(nooverlap_target, Address::times_4); |
1987 | setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx |
1988 | // r9 is used to save r15_thread |
1989 | |
1990 | DecoratorSet decorators = IN_HEAP | IS_ARRAY; |
1991 | if (dest_uninitialized) { |
1992 | decorators |= IS_DEST_UNINITIALIZED; |
1993 | } |
1994 | if (aligned) { |
1995 | decorators |= ARRAYCOPY_ALIGNED; |
1996 | } |
1997 | |
1998 | BasicType type = is_oop ? T_OBJECT : T_INT; |
1999 | BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2000 | // no registers are destroyed by this call |
2001 | bs->arraycopy_prologue(_masm, decorators, type, from, to, count); |
2002 | |
2003 | assert_clean_int(count, rax); // Make sure 'count' is clean int. |
2004 | // 'from', 'to' and 'count' are now valid |
2005 | __ movptr(dword_count, count); |
2006 | __ shrptr(count, 1); // count => qword_count |
2007 | |
2008 | // Copy from high to low addresses. Use 'to' as scratch. |
2009 | |
2010 | // Check for and copy trailing dword |
2011 | __ testl(dword_count, 1); |
2012 | __ jcc(Assembler::zero, L_copy_bytes); |
2013 | __ movl(rax, Address(from, dword_count, Address::times_4, -4)); |
2014 | __ movl(Address(to, dword_count, Address::times_4, -4), rax); |
2015 | __ jmp(L_copy_bytes); |
2016 | |
2017 | // Copy trailing qwords |
2018 | __ BIND(L_copy_8_bytes); |
2019 | __ movq(rax, Address(from, qword_count, Address::times_8, -8)); |
2020 | __ movq(Address(to, qword_count, Address::times_8, -8), rax); |
2021 | __ decrement(qword_count); |
2022 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
2023 | |
2024 | if (is_oop) { |
2025 | __ jmp(L_exit); |
2026 | } |
2027 | restore_arg_regs_using_thread(); |
2028 | inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free |
2029 | __ xorptr(rax, rax); // return 0 |
2030 | __ vzeroupper(); |
2031 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2032 | __ ret(0); |
2033 | |
2034 | // Copy in multi-byte chunks |
2035 | copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
2036 | |
2037 | __ BIND(L_exit); |
2038 | bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); |
2039 | restore_arg_regs_using_thread(); |
2040 | inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free |
2041 | __ xorptr(rax, rax); // return 0 |
2042 | __ vzeroupper(); |
2043 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2044 | __ ret(0); |
2045 | |
2046 | return start; |
2047 | } |
2048 | |
2049 | // Arguments: |
2050 | // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
2051 | // ignored |
2052 | // is_oop - true => oop array, so generate store check code |
2053 | // name - stub name string |
2054 | // |
2055 | // Inputs: |
2056 | // c_rarg0 - source array address |
2057 | // c_rarg1 - destination array address |
2058 | // c_rarg2 - element count, treated as ssize_t, can be zero |
2059 | // |
2060 | // Side Effects: |
2061 | // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the |
2062 | // no-overlap entry point used by generate_conjoint_long_oop_copy(). |
2063 | // |
2064 | address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, |
2065 | const char *name, bool dest_uninitialized = false) { |
2066 | __ align(CodeEntryAlignment); |
2067 | StubCodeMark mark(this, "StubRoutines" , name); |
2068 | address start = __ pc(); |
2069 | |
2070 | Label L_copy_bytes, L_copy_8_bytes, L_exit; |
2071 | const Register from = rdi; // source array address |
2072 | const Register to = rsi; // destination array address |
2073 | const Register qword_count = rdx; // elements count |
2074 | const Register end_from = from; // source array end address |
2075 | const Register end_to = rcx; // destination array end address |
2076 | const Register saved_count = r11; |
2077 | // End pointers are inclusive, and if count is not zero they point |
2078 | // to the last unit copied: end_to[0] := end_from[0] |
2079 | |
2080 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
2081 | // Save no-overlap entry point for generate_conjoint_long_oop_copy() |
2082 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
2083 | |
2084 | if (entry != NULL) { |
2085 | *entry = __ pc(); |
2086 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
2087 | BLOCK_COMMENT("Entry:" ); |
2088 | } |
2089 | |
2090 | setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx |
2091 | // r9 is used to save r15_thread |
2092 | // 'from', 'to' and 'qword_count' are now valid |
2093 | |
2094 | DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; |
2095 | if (dest_uninitialized) { |
2096 | decorators |= IS_DEST_UNINITIALIZED; |
2097 | } |
2098 | if (aligned) { |
2099 | decorators |= ARRAYCOPY_ALIGNED; |
2100 | } |
2101 | |
2102 | BasicType type = is_oop ? T_OBJECT : T_LONG; |
2103 | BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2104 | bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); |
2105 | |
2106 | // Copy from low to high addresses. Use 'to' as scratch. |
2107 | __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); |
2108 | __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); |
2109 | __ negptr(qword_count); |
2110 | __ jmp(L_copy_bytes); |
2111 | |
2112 | // Copy trailing qwords |
2113 | __ BIND(L_copy_8_bytes); |
2114 | __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); |
2115 | __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); |
2116 | __ increment(qword_count); |
2117 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
2118 | |
2119 | if (is_oop) { |
2120 | __ jmp(L_exit); |
2121 | } else { |
2122 | restore_arg_regs_using_thread(); |
2123 | inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free |
2124 | __ xorptr(rax, rax); // return 0 |
2125 | __ vzeroupper(); |
2126 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2127 | __ ret(0); |
2128 | } |
2129 | |
2130 | // Copy in multi-byte chunks |
2131 | copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
2132 | |
2133 | __ BIND(L_exit); |
2134 | bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); |
2135 | restore_arg_regs_using_thread(); |
2136 | if (is_oop) { |
2137 | inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free |
2138 | } else { |
2139 | inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free |
2140 | } |
2141 | __ vzeroupper(); |
2142 | __ xorptr(rax, rax); // return 0 |
2143 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2144 | __ ret(0); |
2145 | |
2146 | return start; |
2147 | } |
2148 | |
2149 | // Arguments: |
2150 | // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
2151 | // ignored |
2152 | // is_oop - true => oop array, so generate store check code |
2153 | // name - stub name string |
2154 | // |
2155 | // Inputs: |
2156 | // c_rarg0 - source array address |
2157 | // c_rarg1 - destination array address |
2158 | // c_rarg2 - element count, treated as ssize_t, can be zero |
2159 | // |
2160 | address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, |
2161 | address nooverlap_target, address *entry, |
2162 | const char *name, bool dest_uninitialized = false) { |
2163 | __ align(CodeEntryAlignment); |
2164 | StubCodeMark mark(this, "StubRoutines" , name); |
2165 | address start = __ pc(); |
2166 | |
2167 | Label L_copy_bytes, L_copy_8_bytes, L_exit; |
2168 | const Register from = rdi; // source array address |
2169 | const Register to = rsi; // destination array address |
2170 | const Register qword_count = rdx; // elements count |
2171 | const Register saved_count = rcx; |
2172 | |
2173 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
2174 | assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. |
2175 | |
2176 | if (entry != NULL) { |
2177 | *entry = __ pc(); |
2178 | // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
2179 | BLOCK_COMMENT("Entry:" ); |
2180 | } |
2181 | |
2182 | array_overlap_test(nooverlap_target, Address::times_8); |
2183 | setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx |
2184 | // r9 is used to save r15_thread |
2185 | // 'from', 'to' and 'qword_count' are now valid |
2186 | |
2187 | DecoratorSet decorators = IN_HEAP | IS_ARRAY; |
2188 | if (dest_uninitialized) { |
2189 | decorators |= IS_DEST_UNINITIALIZED; |
2190 | } |
2191 | if (aligned) { |
2192 | decorators |= ARRAYCOPY_ALIGNED; |
2193 | } |
2194 | |
2195 | BasicType type = is_oop ? T_OBJECT : T_LONG; |
2196 | BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2197 | bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); |
2198 | |
2199 | __ jmp(L_copy_bytes); |
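     | // Every element is already a full qword, so there is no sub-qword tail to |
     | // peel off; go straight to the bulk copy and let the 8-byte loop below |
     | // finish any remaining qwords. |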
2200 | |
2201 | // Copy trailing qwords |
2202 | __ BIND(L_copy_8_bytes); |
2203 | __ movq(rax, Address(from, qword_count, Address::times_8, -8)); |
2204 | __ movq(Address(to, qword_count, Address::times_8, -8), rax); |
2205 | __ decrement(qword_count); |
2206 | __ jcc(Assembler::notZero, L_copy_8_bytes); |
2207 | |
2208 | if (is_oop) { |
2209 | __ jmp(L_exit); |
2210 | } else { |
2211 | restore_arg_regs_using_thread(); |
2212 | inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free |
2213 | __ xorptr(rax, rax); // return 0 |
2214 | __ vzeroupper(); |
2215 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2216 | __ ret(0); |
2217 | } |
2218 | |
2219 | // Copy in multi-byte chunks |
2220 | copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); |
2221 | |
2222 | __ BIND(L_exit); |
2223 | bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); |
2224 | restore_arg_regs_using_thread(); |
2225 | if (is_oop) { |
2226 | inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free |
2227 | } else { |
2228 | inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free |
2229 | } |
2230 | __ vzeroupper(); |
2231 | __ xorptr(rax, rax); // return 0 |
2232 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2233 | __ ret(0); |
2234 | |
2235 | return start; |
2236 | } |
2237 | |
2238 | |
2239 | // Helper for generating a dynamic type check. |
2240 | // Smashes no registers. |
2241 | void generate_type_check(Register sub_klass, |
2242 | Register super_check_offset, |
2243 | Register super_klass, |
2244 | Label& L_success) { |
2245 | assert_different_registers(sub_klass, super_check_offset, super_klass); |
2246 | |
2247 | BLOCK_COMMENT("type_check:" ); |
2248 | |
2249 | Label L_miss; |
2250 | |
2251 | __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, |
2252 | super_check_offset); |
2253 | __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); |
2254 | |
2255 | // Fall through on failure! |
2256 | __ BIND(L_miss); |
2257 | } |
2258 | |
2259 | // |
2260 | // Generate checkcasting array copy stub |
2261 | // |
2262 | // Input: |
2263 | // c_rarg0 - source array address |
2264 | // c_rarg1 - destination array address |
2265 | // c_rarg2 - element count, treated as ssize_t, can be zero |
2266 | // c_rarg3 - size_t ckoff (super_check_offset) |
2267 | // not Win64 |
2268 | // c_rarg4 - oop ckval (super_klass) |
2269 | // Win64 |
2270 | // rsp+40 - oop ckval (super_klass) |
2271 | // |
2272 | // Output: |
2273 | // rax == 0 - success |
2274 | // rax == -1^K - failure, where K is partial transfer count |
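     | //               (-1^K is the bitwise complement ~K: for example, if three |
     | //               elements were copied before the failing one, rax == ~3 == -4 |
     | //               and the caller recovers K as ~rax; see the notptr(rax) below) |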
2275 | // |
2276 | address generate_checkcast_copy(const char *name, address *entry, |
2277 | bool dest_uninitialized = false) { |
2278 | |
2279 | Label L_load_element, L_store_element, L_do_card_marks, L_done; |
2280 | |
2281 | // Input registers (after setup_arg_regs) |
2282 | const Register from = rdi; // source array address |
2283 | const Register to = rsi; // destination array address |
2284 | const Register length = rdx; // elements count |
2285 | const Register ckoff = rcx; // super_check_offset |
2286 | const Register ckval = r8; // super_klass |
2287 | |
2288 | // Registers used as temps (r13, r14 are save-on-entry) |
2289 | const Register end_from = from; // source array end address |
2290 | const Register end_to = r13; // destination array end address |
2291 | const Register count = rdx; // -(count_remaining) |
2292 | const Register r14_length = r14; // saved copy of length |
2293 | // End pointers are inclusive, and if length is not zero they point |
2294 | // to the last unit copied: end_to[0] := end_from[0] |
2295 | |
2296 | const Register rax_oop = rax; // actual oop copied |
2297 | const Register r11_klass = r11; // oop._klass |
2298 | |
2299 | //--------------------------------------------------------------- |
2300 | // Assembler stub will be used for this call to arraycopy |
2301 | // if the two arrays are subtypes of Object[] but the |
2302 | // destination array type is not equal to or a supertype |
2303 | // of the source type. Each element must be separately |
2304 | // checked. |
2305 | |
2306 | __ align(CodeEntryAlignment); |
2307 | StubCodeMark mark(this, "StubRoutines" , name); |
2308 | address start = __ pc(); |
2309 | |
2310 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
2311 | |
2312 | #ifdef ASSERT |
2313 | // caller guarantees that the arrays really are different |
2314 | // otherwise, we would have to make conjoint checks |
2315 | { Label L; |
2316 | array_overlap_test(L, TIMES_OOP); |
2317 | __ stop("checkcast_copy within a single array" ); |
2318 | __ bind(L); |
2319 | } |
2320 | #endif //ASSERT |
2321 | |
2322 | setup_arg_regs(4); // from => rdi, to => rsi, length => rdx |
2323 | // ckoff => rcx, ckval => r8 |
2324 | // r9 and r10 may be used to save non-volatile registers |
2325 | #ifdef _WIN64 |
2326 | // last argument (#4) is on stack on Win64 |
2327 | __ movptr(ckval, Address(rsp, 6 * wordSize)); |
2328 | #endif |
2329 | |
2330 | // Caller of this entry point must set up the argument registers. |
2331 | if (entry != NULL) { |
2332 | *entry = __ pc(); |
2333 | BLOCK_COMMENT("Entry:" ); |
2334 | } |
2335 | |
2336 | // allocate spill slots for r13, r14 |
2337 | enum { |
2338 | saved_r13_offset, |
2339 | saved_r14_offset, |
2340 | saved_r10_offset, |
2341 | saved_rbp_offset |
2342 | }; |
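     | // The enum values are word offsets from the new rsp: subptr below reserves |
     | // three words (saved_rbp_offset == 3) for r13, r14 and r10, and the rbp |
     | // pushed by enter() then sits at saved_rbp_offset. |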
2343 | __ subptr(rsp, saved_rbp_offset * wordSize); |
2344 | __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); |
2345 | __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); |
2346 | __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); |
2347 | |
2348 | #ifdef ASSERT |
2349 | Label L2; |
2350 | __ get_thread(r14); |
2351 | __ cmpptr(r15_thread, r14); |
2352 | __ jcc(Assembler::equal, L2); |
2353 | __ stop("StubRoutines::call_stub: r15_thread is modified by call" ); |
2354 | __ bind(L2); |
2355 | #endif // ASSERT |
2356 | |
2357 | // check that int operands are properly extended to size_t |
2358 | assert_clean_int(length, rax); |
2359 | assert_clean_int(ckoff, rax); |
2360 | |
2361 | #ifdef ASSERT |
2362 | BLOCK_COMMENT("assert consistent ckoff/ckval" ); |
2363 | // The ckoff and ckval must be mutually consistent, |
2364 | // even though caller generates both. |
2365 | { Label L; |
2366 | int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
2367 | __ cmpl(ckoff, Address(ckval, sco_offset)); |
2368 | __ jcc(Assembler::equal, L); |
2369 | __ stop("super_check_offset inconsistent" ); |
2370 | __ bind(L); |
2371 | } |
2372 | #endif //ASSERT |
2373 | |
2374 | // Loop-invariant addresses. They are exclusive end pointers. |
2375 | Address end_from_addr(from, length, TIMES_OOP, 0); |
2376 | Address end_to_addr(to, length, TIMES_OOP, 0); |
2377 | // Loop-variant addresses. They assume post-incremented count < 0. |
2378 | Address from_element_addr(end_from, count, TIMES_OOP, 0); |
2379 | Address to_element_addr(end_to, count, TIMES_OOP, 0); |
2380 | |
2381 | DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; |
2382 | if (dest_uninitialized) { |
2383 | decorators |= IS_DEST_UNINITIALIZED; |
2384 | } |
2385 | |
2386 | BasicType type = T_OBJECT; |
2387 | BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2388 | bs->arraycopy_prologue(_masm, decorators, type, from, to, count); |
2389 | |
2390 | // Copy from low to high addresses, indexed from the end of each array. |
2391 | __ lea(end_from, end_from_addr); |
2392 | __ lea(end_to, end_to_addr); |
2393 | __ movptr(r14_length, length); // save a copy of the length |
2394 | assert(length == count, "" ); // else fix next line: |
2395 | __ negptr(count); // negate and test the length |
2396 | __ jcc(Assembler::notZero, L_load_element); |
2397 | |
2398 | // Empty array: Nothing to do. |
2399 | __ xorptr(rax, rax); // return 0 on (trivial) success |
2400 | __ jmp(L_done); |
2401 | |
2402 | // ======== begin loop ======== |
2403 | // (Loop is rotated; its entry is L_load_element.) |
2404 | // Loop control: |
2405 | // for (count = -count; count != 0; count++) |
2406 | // Base pointers src, dst are biased by 8*(count-1), to the last element. |
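     | // Roughly, as pseudocode (illustration only; helper names are descriptive, |
     | // not real functions): |
     | //   for (count = -length; count != 0; count++) { |
     | //     oop o = end_from[count];                                    // L_load_element |
     | //     if (o != NULL && !klass_of(o)->is_subtype_of(ckval)) break; // type check |
     | //     end_to[count] = o;                                          // L_store_element |
     | //   } |
     | //   // a NULL element is always stored without a type check |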
2407 | __ align(OptoLoopAlignment); |
2408 | |
2409 | __ BIND(L_store_element); |
2410 | __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW); // store the oop |
2411 | __ increment(count); // increment the count toward zero |
2412 | __ jcc(Assembler::zero, L_do_card_marks); |
2413 | |
2414 | // ======== loop entry is here ======== |
2415 | __ BIND(L_load_element); |
2416 | __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop |
2417 | __ testptr(rax_oop, rax_oop); |
2418 | __ jcc(Assembler::zero, L_store_element); |
2419 | |
2420 | __ load_klass(r11_klass, rax_oop);// query the object klass |
2421 | generate_type_check(r11_klass, ckoff, ckval, L_store_element); |
2422 | // ======== end loop ======== |
2423 | |
2424 | // It was a real error; we must depend on the caller to finish the job. |
2425 | // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. |
2426 | // Emit GC store barriers for the oops we have copied (r14 + rdx), |
2427 | // and report their number to the caller. |
2428 | assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); |
2429 | Label L_post_barrier; |
2430 | __ addptr(r14_length, count); // K = (original - remaining) oops |
2431 | __ movptr(rax, r14_length); // save the value |
2432 | __ notptr(rax); // report (-1^K) to caller (does not affect flags) |
2433 | __ jccb(Assembler::notZero, L_post_barrier); |
2434 | __ jmp(L_done); // K == 0, nothing was copied, skip post barrier |
2435 | |
2436 | // Come here on success only. |
2437 | __ BIND(L_do_card_marks); |
2438 | __ xorptr(rax, rax); // return 0 on success |
2439 | |
2440 | __ BIND(L_post_barrier); |
2441 | bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); |
2442 | |
2443 | // Common exit point (success or failure). |
2444 | __ BIND(L_done); |
2445 | __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); |
2446 | __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); |
2447 | __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); |
2448 | restore_arg_regs(); |
2449 | inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free |
2450 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2451 | __ ret(0); |
2452 | |
2453 | return start; |
2454 | } |
2455 | |
2456 | // |
2457 | // Generate 'unsafe' array copy stub |
2458 | // Though just as safe as the other stubs, it takes an unscaled |
2459 | // size_t argument instead of an element count. |
2460 | // |
2461 | // Input: |
2462 | // c_rarg0 - source array address |
2463 | // c_rarg1 - destination array address |
2464 | // c_rarg2 - byte count, treated as ssize_t, can be zero |
2465 | // |
2466 | // Examines the alignment of the operands and dispatches |
2467 | // to a long, int, short, or byte copy loop. |
2468 | // |
2469 | address generate_unsafe_copy(const char *name, |
2470 | address byte_copy_entry, address short_copy_entry, |
2471 | address int_copy_entry, address long_copy_entry) { |
2472 | |
2473 | Label L_long_aligned, L_int_aligned, L_short_aligned; |
2474 | |
2475 | // Input registers (before setup_arg_regs) |
2476 | const Register from = c_rarg0; // source array address |
2477 | const Register to = c_rarg1; // destination array address |
2478 | const Register size = c_rarg2; // byte count (size_t) |
2479 | |
2480 | // Register used as a temp |
2481 | const Register bits = rax; // test copy of low bits |
2482 | |
2483 | __ align(CodeEntryAlignment); |
2484 | StubCodeMark mark(this, "StubRoutines" , name); |
2485 | address start = __ pc(); |
2486 | |
2487 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
2488 | |
2489 | // bump this on entry, not on exit: |
2490 | inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); |
2491 | |
2492 | __ mov(bits, from); |
2493 | __ orptr(bits, to); |
2494 | __ orptr(bits, size); |
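     | // 'bits' now holds (from | to | size); its low three bits give the coarsest |
     | // alignment common to both addresses and the byte count, which selects the |
     | // widest element size that is safe to copy with. |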
2495 | |
2496 | __ testb(bits, BytesPerLong-1); |
2497 | __ jccb(Assembler::zero, L_long_aligned); |
2498 | |
2499 | __ testb(bits, BytesPerInt-1); |
2500 | __ jccb(Assembler::zero, L_int_aligned); |
2501 | |
2502 | __ testb(bits, BytesPerShort-1); |
2503 | __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); |
2504 | |
2505 | __ BIND(L_short_aligned); |
2506 | __ shrptr(size, LogBytesPerShort); // size => short_count |
2507 | __ jump(RuntimeAddress(short_copy_entry)); |
2508 | |
2509 | __ BIND(L_int_aligned); |
2510 | __ shrptr(size, LogBytesPerInt); // size => int_count |
2511 | __ jump(RuntimeAddress(int_copy_entry)); |
2512 | |
2513 | __ BIND(L_long_aligned); |
2514 | __ shrptr(size, LogBytesPerLong); // size => qword_count |
2515 | __ jump(RuntimeAddress(long_copy_entry)); |
2516 | |
2517 | return start; |
2518 | } |
2519 | |
2520 | // Perform range checks on the proposed arraycopy. |
2521 | // Kills temp, but nothing else. |
2522 | // Also, clean the sign bits of src_pos and dst_pos. |
2523 | void arraycopy_range_checks(Register src, // source array oop (c_rarg0) |
2524 | Register src_pos, // source position (c_rarg1) |
2525 | Register dst, // destination array oop (c_rarg2) |
2526 | Register dst_pos, // destination position (c_rarg3) |
2527 | Register length, |
2528 | Register temp, |
2529 | Label& L_failed) { |
2530 | BLOCK_COMMENT("arraycopy_range_checks:" ); |
2531 | |
2532 | // if (src_pos + length > arrayOop(src)->length()) FAIL; |
2533 | __ movl(temp, length); |
2534 | __ addl(temp, src_pos); // src_pos + length |
2535 | __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); |
2536 | __ jcc(Assembler::above, L_failed); |
2537 | |
2538 | // if (dst_pos + length > arrayOop(dst)->length()) FAIL; |
2539 | __ movl(temp, length); |
2540 | __ addl(temp, dst_pos); // dst_pos + length |
2541 | __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); |
2542 | __ jcc(Assembler::above, L_failed); |
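     | // (The sums above cannot wrap: both operands are non-negative 32-bit ints, |
     | // so a single unsigned 'above' compare against the array length suffices.) |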
2543 | |
2544 | // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. |
2545 | // A move with sign extension (movslq) is safe since both values are known to be non-negative. |
2546 | __ movslq(src_pos, src_pos); |
2547 | __ movslq(dst_pos, dst_pos); |
2548 | |
2549 | BLOCK_COMMENT("arraycopy_range_checks done" ); |
2550 | } |
2551 | |
2552 | // |
2553 | // Generate generic array copy stubs |
2554 | // |
2555 | // Input: |
2556 | // c_rarg0 - src oop |
2557 | // c_rarg1 - src_pos (32-bits) |
2558 | // c_rarg2 - dst oop |
2559 | // c_rarg3 - dst_pos (32-bits) |
2560 | // not Win64 |
2561 | // c_rarg4 - element count (32-bits) |
2562 | // Win64 |
2563 | // rsp+40 - element count (32-bits) |
2564 | // |
2565 | // Output: |
2566 | // rax == 0 - success |
2567 | // rax == -1^K - failure, where K is partial transfer count |
2568 | // |
2569 | address generate_generic_copy(const char *name, |
2570 | address byte_copy_entry, address short_copy_entry, |
2571 | address int_copy_entry, address oop_copy_entry, |
2572 | address long_copy_entry, address checkcast_copy_entry) { |
2573 | |
2574 | Label L_failed, L_failed_0, L_objArray; |
2575 | Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; |
2576 | |
2577 | // Input registers |
2578 | const Register src = c_rarg0; // source array oop |
2579 | const Register src_pos = c_rarg1; // source position |
2580 | const Register dst = c_rarg2; // destination array oop |
2581 | const Register dst_pos = c_rarg3; // destination position |
2582 | #ifndef _WIN64 |
2583 | const Register length = c_rarg4; |
2584 | #else |
2585 | const Address length(rsp, 6 * wordSize); // elements count is on stack on Win64 |
2586 | #endif |
2587 | |
2588 | { int modulus = CodeEntryAlignment; |
2589 | int target = modulus - 5; // 5 = sizeof jmp(L_failed) |
2590 | int advance = target - (__ offset() % modulus); |
2591 | if (advance < 0) advance += modulus; |
2592 | if (advance > 0) __ nop(advance); |
2593 | } |
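     | // The padding above positions the 5-byte jmp(L_failed) emitted next so that |
     | // it ends exactly on a CodeEntryAlignment boundary; the align() further down |
     | // is then a no-op and the failure trampoline stays packed right against the |
     | // stub entry point. |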
2594 | StubCodeMark mark(this, "StubRoutines" , name); |
2595 | |
2596 | // Short-hop target to L_failed. Makes for denser prologue code. |
2597 | __ BIND(L_failed_0); |
2598 | __ jmp(L_failed); |
2599 | assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed" ); |
2600 | |
2601 | __ align(CodeEntryAlignment); |
2602 | address start = __ pc(); |
2603 | |
2604 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
2605 | |
2606 | // bump this on entry, not on exit: |
2607 | inc_counter_np(SharedRuntime::_generic_array_copy_ctr); |
2608 | |
2609 | //----------------------------------------------------------------------- |
2610 | // Assembler stub will be used for this call to arraycopy |
2611 | // if the following conditions are met: |
2612 | // |
2613 | // (1) src and dst must not be null. |
2614 | // (2) src_pos must not be negative. |
2615 | // (3) dst_pos must not be negative. |
2616 | // (4) length must not be negative. |
2617 | // (5) src klass and dst klass should be the same and not NULL. |
2618 | // (6) src and dst should be arrays. |
2619 | // (7) src_pos + length must not exceed length of src. |
2620 | // (8) dst_pos + length must not exceed length of dst. |
2621 | // |
2622 | |
2623 | // if (src == NULL) return -1; |
2624 | __ testptr(src, src); // src oop |
2625 | size_t j1off = __ offset(); |
2626 | __ jccb(Assembler::zero, L_failed_0); |
2627 | |
2628 | // if (src_pos < 0) return -1; |
2629 | __ testl(src_pos, src_pos); // src_pos (32-bits) |
2630 | __ jccb(Assembler::negative, L_failed_0); |
2631 | |
2632 | // if (dst == NULL) return -1; |
2633 | __ testptr(dst, dst); // dst oop |
2634 | __ jccb(Assembler::zero, L_failed_0); |
2635 | |
2636 | // if (dst_pos < 0) return -1; |
2637 | __ testl(dst_pos, dst_pos); // dst_pos (32-bits) |
2638 | size_t j4off = __ offset(); |
2639 | __ jccb(Assembler::negative, L_failed_0); |
2640 | |
2641 | // The first four tests are very dense code, |
2642 | // but not quite dense enough to put four |
2643 | // jumps in a 16-byte instruction fetch buffer. |
2644 | // That's good, because some branch predictors |
2645 | // do not like jumps so close together. |
2646 | // Make sure of this. |
2647 | guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps" ); |
2648 | |
2649 | // registers used as temp |
2650 | const Register r11_length = r11; // elements count to copy |
2651 | const Register r10_src_klass = r10; // array klass |
2652 | |
2653 | // if (length < 0) return -1; |
2654 | __ movl(r11_length, length); // length (elements count, 32-bits value) |
2655 | __ testl(r11_length, r11_length); |
2656 | __ jccb(Assembler::negative, L_failed_0); |
2657 | |
2658 | __ load_klass(r10_src_klass, src); |
2659 | #ifdef ASSERT |
2660 | // assert(src->klass() != NULL); |
2661 | { |
2662 | BLOCK_COMMENT("assert klasses not null {" ); |
2663 | Label L1, L2; |
2664 | __ testptr(r10_src_klass, r10_src_klass); |
2665 | __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL |
2666 | __ bind(L1); |
2667 | __ stop("broken null klass" ); |
2668 | __ bind(L2); |
2669 | __ load_klass(rax, dst); |
2670 | __ cmpq(rax, 0); |
2671 | __ jcc(Assembler::equal, L1); // this would be broken also |
2672 | BLOCK_COMMENT("} assert klasses not null done" ); |
2673 | } |
2674 | #endif |
2675 | |
2676 | // Load layout helper (32-bits) |
2677 | // |
2678 | // |array_tag|     | header_size | element_type |     |log2_element_size| |
2679 | // 32        30    24            16              8     2                 0 |
2680 | // |
2681 | // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 |
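     | // For a typeArray the copy parameters are decoded further down as: |
     | //   header_size_in_bytes = (lh >> _lh_header_size_shift) & _lh_header_size_mask |
     | //   log2_element_size    =  lh & _lh_log2_element_size_mask |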
2682 | // |
2683 | |
2684 | const int lh_offset = in_bytes(Klass::layout_helper_offset()); |
2685 | |
2686 | // Handle objArrays completely differently... |
2687 | const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); |
2688 | __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); |
2689 | __ jcc(Assembler::equal, L_objArray); |
2690 | |
2691 | // if (src->klass() != dst->klass()) return -1; |
2692 | __ load_klass(rax, dst); |
2693 | __ cmpq(r10_src_klass, rax); |
2694 | __ jcc(Assembler::notEqual, L_failed); |
2695 | |
2696 | const Register rax_lh = rax; // layout helper |
2697 | __ movl(rax_lh, Address(r10_src_klass, lh_offset)); |
2698 | |
2699 | // if (!src->is_Array()) return -1; |
2700 | __ cmpl(rax_lh, Klass::_lh_neutral_value); |
2701 | __ jcc(Assembler::greaterEqual, L_failed); |
2702 | |
2703 | // At this point, it is known to be a typeArray (array_tag 0x3). |
2704 | #ifdef ASSERT |
2705 | { |
2706 | BLOCK_COMMENT("assert primitive array {" ); |
2707 | Label L; |
2708 | __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); |
2709 | __ jcc(Assembler::greaterEqual, L); |
2710 | __ stop("must be a primitive array" ); |
2711 | __ bind(L); |
2712 | BLOCK_COMMENT("} assert primitive array done" ); |
2713 | } |
2714 | #endif |
2715 | |
2716 | arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, |
2717 | r10, L_failed); |
2718 | |
2719 | // TypeArrayKlass |
2720 | // |
2721 | // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); |
2722 | // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); |
2723 | // |
2724 | |
2725 | const Register r10_offset = r10; // array offset |
2726 | const Register rax_elsize = rax_lh; // element size |
2727 | |
2728 | __ movl(r10_offset, rax_lh); |
2729 | __ shrl(r10_offset, Klass::_lh_header_size_shift); |
2730 | __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset |
2731 | __ addptr(src, r10_offset); // src array offset |
2732 | __ addptr(dst, r10_offset); // dst array offset |
2733 | BLOCK_COMMENT("choose copy loop based on element size" ); |
2734 | __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize |
2735 | |
2736 | // next registers should be set before the jump to corresponding stub |
2737 | const Register from = c_rarg0; // source array address |
2738 | const Register to = c_rarg1; // destination array address |
2739 | const Register count = c_rarg2; // elements count |
2740 | |
2741 | // The 'from', 'to' and 'count' registers must be set in exactly this order, |
2742 | // since they alias 'src', 'src_pos' and 'dst' and each is overwritten only after its old value is dead. |
2743 | |
2744 | __ BIND(L_copy_bytes); |
2745 | __ cmpl(rax_elsize, 0); |
2746 | __ jccb(Assembler::notEqual, L_copy_shorts); |
2747 | __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr |
2748 | __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr |
2749 | __ movl2ptr(count, r11_length); // length |
2750 | __ jump(RuntimeAddress(byte_copy_entry)); |
2751 | |
2752 | __ BIND(L_copy_shorts); |
2753 | __ cmpl(rax_elsize, LogBytesPerShort); |
2754 | __ jccb(Assembler::notEqual, L_copy_ints); |
2755 | __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr |
2756 | __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr |
2757 | __ movl2ptr(count, r11_length); // length |
2758 | __ jump(RuntimeAddress(short_copy_entry)); |
2759 | |
2760 | __ BIND(L_copy_ints); |
2761 | __ cmpl(rax_elsize, LogBytesPerInt); |
2762 | __ jccb(Assembler::notEqual, L_copy_longs); |
2763 | __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr |
2764 | __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr |
2765 | __ movl2ptr(count, r11_length); // length |
2766 | __ jump(RuntimeAddress(int_copy_entry)); |
2767 | |
2768 | __ BIND(L_copy_longs); |
2769 | #ifdef ASSERT |
2770 | { |
2771 | BLOCK_COMMENT("assert long copy {" ); |
2772 | Label L; |
2773 | __ cmpl(rax_elsize, LogBytesPerLong); |
2774 | __ jcc(Assembler::equal, L); |
2775 | __ stop("must be long copy, but elsize is wrong" ); |
2776 | __ bind(L); |
2777 | BLOCK_COMMENT("} assert long copy done" ); |
2778 | } |
2779 | #endif |
2780 | __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr |
2781 | __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr |
2782 | __ movl2ptr(count, r11_length); // length |
2783 | __ jump(RuntimeAddress(long_copy_entry)); |
2784 | |
2785 | // ObjArrayKlass |
2786 | __ BIND(L_objArray); |
2787 | // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] |
2788 | |
2789 | Label L_plain_copy, L_checkcast_copy; |
2790 | // test array classes for subtyping |
2791 | __ load_klass(rax, dst); |
2792 | __ cmpq(r10_src_klass, rax); // usual case is exact equality |
2793 | __ jcc(Assembler::notEqual, L_checkcast_copy); |
2794 | |
2795 | // Identically typed arrays can be copied without element-wise checks. |
2796 | arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, |
2797 | r10, L_failed); |
2798 | |
2799 | __ lea(from, Address(src, src_pos, TIMES_OOP, |
2800 | arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr |
2801 | __ lea(to, Address(dst, dst_pos, TIMES_OOP, |
2802 | arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr |
2803 | __ movl2ptr(count, r11_length); // length |
2804 | __ BIND(L_plain_copy); |
2805 | __ jump(RuntimeAddress(oop_copy_entry)); |
2806 | |
2807 | __ BIND(L_checkcast_copy); |
2808 | // live at this point: r10_src_klass, r11_length, rax (dst_klass) |
2809 | { |
2810 | // Before looking at dst.length, make sure dst is also an objArray. |
2811 | __ cmpl(Address(rax, lh_offset), objArray_lh); |
2812 | __ jcc(Assembler::notEqual, L_failed); |
2813 | |
2814 | // It is safe to examine both src.length and dst.length. |
2815 | arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, |
2816 | rax, L_failed); |
2817 | |
2818 | const Register r11_dst_klass = r11; |
2819 | __ load_klass(r11_dst_klass, dst); // reload |
2820 | |
2821 | // Marshal the base address arguments now, freeing registers. |
2822 | __ lea(from, Address(src, src_pos, TIMES_OOP, |
2823 | arrayOopDesc::base_offset_in_bytes(T_OBJECT))); |
2824 | __ lea(to, Address(dst, dst_pos, TIMES_OOP, |
2825 | arrayOopDesc::base_offset_in_bytes(T_OBJECT))); |
2826 | __ movl(count, length); // length (reloaded) |
2827 | Register sco_temp = c_rarg3; // this register is free now |
2828 | assert_different_registers(from, to, count, sco_temp, |
2829 | r11_dst_klass, r10_src_klass); |
2830 | assert_clean_int(count, sco_temp); |
2831 | |
2832 | // Generate the type check. |
2833 | const int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
2834 | __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); |
2835 | assert_clean_int(sco_temp, rax); |
2836 | generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); |
2837 | |
2838 | // Fetch destination element klass from the ObjArrayKlass header. |
2839 | int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); |
2840 | __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); |
2841 | __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); |
2842 | assert_clean_int(sco_temp, rax); |
2843 | |
2844 | // the checkcast_copy loop needs two extra arguments: |
2845 | assert(c_rarg3 == sco_temp, "#3 already in place" ); |
2846 | // Set up arguments for checkcast_copy_entry. |
2847 | setup_arg_regs(4); |
2848 | __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris |
2849 | __ jump(RuntimeAddress(checkcast_copy_entry)); |
2850 | } |
2851 | |
2852 | __ BIND(L_failed); |
2853 | __ xorptr(rax, rax); |
2854 | __ notptr(rax); // return -1 |
2855 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
2856 | __ ret(0); |
2857 | |
2858 | return start; |
2859 | } |
2860 | |
2861 | void generate_arraycopy_stubs() { |
2862 | address entry; |
2863 | address entry_jbyte_arraycopy; |
2864 | address entry_jshort_arraycopy; |
2865 | address entry_jint_arraycopy; |
2866 | address entry_oop_arraycopy; |
2867 | address entry_jlong_arraycopy; |
2868 | address entry_checkcast_arraycopy; |
2869 | |
2870 | StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, |
2871 | "jbyte_disjoint_arraycopy" ); |
2872 | StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy, |
2873 | "jbyte_arraycopy" ); |
2874 | |
2875 | StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, |
2876 | "jshort_disjoint_arraycopy" ); |
2877 | StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy, |
2878 | "jshort_arraycopy" ); |
2879 | |
2880 | StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry, |
2881 | "jint_disjoint_arraycopy" ); |
2882 | StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry, |
2883 | &entry_jint_arraycopy, "jint_arraycopy" ); |
2884 | |
2885 | StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry, |
2886 | "jlong_disjoint_arraycopy" ); |
2887 | StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry, |
2888 | &entry_jlong_arraycopy, "jlong_arraycopy" ); |
2889 | |
2890 | |
2891 | if (UseCompressedOops) { |
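     | // With compressed oops each element is a 32-bit narrowOop, so the oop |
     | // arraycopy stubs reuse the int copy loops here; the else branch uses the |
     | // long copy loops for uncompressed 64-bit oops. |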
2892 | StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry, |
2893 | "oop_disjoint_arraycopy" ); |
2894 | StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry, |
2895 | &entry_oop_arraycopy, "oop_arraycopy" ); |
2896 | StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry, |
2897 | "oop_disjoint_arraycopy_uninit" , |
2898 | /*dest_uninitialized*/true); |
2899 | StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry, |
2900 | NULL, "oop_arraycopy_uninit" , |
2901 | /*dest_uninitialized*/true); |
2902 | } else { |
2903 | StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry, |
2904 | "oop_disjoint_arraycopy" ); |
2905 | StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry, |
2906 | &entry_oop_arraycopy, "oop_arraycopy" ); |
2907 | StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry, |
2908 | "oop_disjoint_arraycopy_uninit" , |
2909 | /*dest_uninitialized*/true); |
2910 | StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry, |
2911 | NULL, "oop_arraycopy_uninit" , |
2912 | /*dest_uninitialized*/true); |
2913 | } |
2914 | |
2915 | StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy" , &entry_checkcast_arraycopy); |
2916 | StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit" , NULL, |
2917 | /*dest_uninitialized*/true); |
2918 | |
2919 | StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy" , |
2920 | entry_jbyte_arraycopy, |
2921 | entry_jshort_arraycopy, |
2922 | entry_jint_arraycopy, |
2923 | entry_jlong_arraycopy); |
2924 | StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy" , |
2925 | entry_jbyte_arraycopy, |
2926 | entry_jshort_arraycopy, |
2927 | entry_jint_arraycopy, |
2928 | entry_oop_arraycopy, |
2929 | entry_jlong_arraycopy, |
2930 | entry_checkcast_arraycopy); |
2931 | |
2932 | StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill" ); |
2933 | StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill" ); |
2934 | StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill" ); |
2935 | StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill" ); |
2936 | StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill" ); |
2937 | StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill" ); |
2938 | |
2939 | // We don't generate specialized code for HeapWord-aligned source |
2940 | // arrays, so just use the code we've already generated |
2941 | StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; |
2942 | StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy; |
2943 | |
2944 | StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy; |
2945 | StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy; |
2946 | |
2947 | StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy; |
2948 | StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; |
2949 | |
2950 | StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy; |
2951 | StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; |
2952 | |
2953 | StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy; |
2954 | StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; |
2955 | |
2956 | StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit; |
2957 | StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit; |
2958 | } |
2959 | |
2960 | // AES intrinsic stubs |
2961 | enum {AESBlockSize = 16}; |
2962 | |
2963 | address generate_key_shuffle_mask() { |
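    // pshufb control that byte-swaps each 32-bit lane, putting the ints of the Java
    // expanded key into the byte order the AES instructions expect (see load_key() below).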
2964 | __ align(16); |
2965 | StubCodeMark mark(this, "StubRoutines" , "key_shuffle_mask" ); |
2966 | address start = __ pc(); |
2967 | __ emit_data64( 0x0405060700010203, relocInfo::none ); |
2968 | __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); |
2969 | return start; |
2970 | } |
2971 | |
2972 | address generate_counter_shuffle_mask() { |
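    // pshufb control that reverses all 16 bytes; the CTR stub uses it to convert the
    // big-endian counter into little-endian qword order for inc_counter(), and back.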
2973 | __ align(16); |
2974 | StubCodeMark mark(this, "StubRoutines" , "counter_shuffle_mask" ); |
2975 | address start = __ pc(); |
2976 | __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); |
2977 | __ emit_data64(0x0001020304050607, relocInfo::none); |
2978 | return start; |
2979 | } |
2980 | |
  // Utility routine for loading a 128-bit key word in little endian format;
  // the shuffle mask may optionally be supplied already loaded in an xmm register.
2983 | void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
2984 | __ movdqu(xmmdst, Address(key, offset)); |
2985 | if (xmm_shuf_mask != NULL) { |
2986 | __ pshufb(xmmdst, xmm_shuf_mask); |
2987 | } else { |
2988 | __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2989 | } |
2990 | } |
2991 | |
  // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
2993 | void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { |
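    // The counter has already been byte-reversed by the caller (counter_shuffle_mask),
    // so qword 0 holds the low 64 bits: add the delta there and, if that overflows,
    // propagate the carry into qword 1.  pinsrq does not modify RFLAGS, so the carry
    // flag produced by addq is still valid at the jcc below.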
2994 | __ pextrq(reg, xmmdst, 0x0); |
2995 | __ addq(reg, inc_delta); |
2996 | __ pinsrq(xmmdst, reg, 0x0); |
2997 | __ jcc(Assembler::carryClear, next_block); // jump if no carry |
2998 | __ pextrq(reg, xmmdst, 0x01); // Carry |
2999 | __ addq(reg, 0x01); |
3000 | __ pinsrq(xmmdst, reg, 0x01); //Carry end |
3001 | __ BIND(next_block); // next instruction |
3002 | } |
3003 | |
3004 | // Arguments: |
3005 | // |
3006 | // Inputs: |
3007 | // c_rarg0 - source byte array address |
3008 | // c_rarg1 - destination byte array address |
3009 | // c_rarg2 - K (key) in little endian int array |
3010 | // |
3011 | address generate_aescrypt_encryptBlock() { |
3012 | assert(UseAES, "need AES instructions and misaligned SSE support" ); |
3013 | __ align(CodeEntryAlignment); |
3014 | StubCodeMark mark(this, "StubRoutines" , "aescrypt_encryptBlock" ); |
3015 | Label L_doLast; |
3016 | address start = __ pc(); |
3017 | |
3018 | const Register from = c_rarg0; // source array address |
3019 | const Register to = c_rarg1; // destination array address |
3020 | const Register key = c_rarg2; // key array address |
3021 | const Register keylen = rax; |
3022 | |
3023 | const XMMRegister xmm_result = xmm0; |
3024 | const XMMRegister xmm_key_shuf_mask = xmm1; |
3025 | // On win64 xmm6-xmm15 must be preserved so don't use them. |
3026 | const XMMRegister xmm_temp1 = xmm2; |
3027 | const XMMRegister xmm_temp2 = xmm3; |
3028 | const XMMRegister xmm_temp3 = xmm4; |
3029 | const XMMRegister xmm_temp4 = xmm5; |
3030 | |
3031 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
3032 | |
3033 | // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} |
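    // (the Java key array holds (rounds + 1) round keys of 4 ints each:
    //  44 ints => AES-128 / 10 rounds, 52 => AES-192 / 12, 60 => AES-256 / 14)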
3034 | __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3035 | |
3036 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3037 | __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input |
3038 | |
3039 | // For encryption, the java expanded key ordering is just what we need |
3040 | // we don't know if the key is aligned, hence not using load-execute form |
3041 | |
3042 | load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); |
3043 | __ pxor(xmm_result, xmm_temp1); |
3044 | |
3045 | load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); |
3046 | load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); |
3047 | load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); |
3048 | load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); |
3049 | |
3050 | __ aesenc(xmm_result, xmm_temp1); |
3051 | __ aesenc(xmm_result, xmm_temp2); |
3052 | __ aesenc(xmm_result, xmm_temp3); |
3053 | __ aesenc(xmm_result, xmm_temp4); |
3054 | |
3055 | load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); |
3056 | load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); |
3057 | load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); |
3058 | load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); |
3059 | |
3060 | __ aesenc(xmm_result, xmm_temp1); |
3061 | __ aesenc(xmm_result, xmm_temp2); |
3062 | __ aesenc(xmm_result, xmm_temp3); |
3063 | __ aesenc(xmm_result, xmm_temp4); |
3064 | |
3065 | load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); |
3066 | load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); |
3067 | |
3068 | __ cmpl(keylen, 44); |
3069 | __ jccb(Assembler::equal, L_doLast); |
3070 | |
3071 | __ aesenc(xmm_result, xmm_temp1); |
3072 | __ aesenc(xmm_result, xmm_temp2); |
3073 | |
3074 | load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); |
3075 | load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); |
3076 | |
3077 | __ cmpl(keylen, 52); |
3078 | __ jccb(Assembler::equal, L_doLast); |
3079 | |
3080 | __ aesenc(xmm_result, xmm_temp1); |
3081 | __ aesenc(xmm_result, xmm_temp2); |
3082 | |
3083 | load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); |
3084 | load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); |
3085 | |
3086 | __ BIND(L_doLast); |
3087 | __ aesenc(xmm_result, xmm_temp1); |
3088 | __ aesenclast(xmm_result, xmm_temp2); |
3089 | __ movdqu(Address(to, 0), xmm_result); // store the result |
3090 | __ xorptr(rax, rax); // return 0 |
3091 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
3092 | __ ret(0); |
3093 | |
3094 | return start; |
3095 | } |
3096 | |
3097 | |
3098 | // Arguments: |
3099 | // |
3100 | // Inputs: |
3101 | // c_rarg0 - source byte array address |
3102 | // c_rarg1 - destination byte array address |
3103 | // c_rarg2 - K (key) in little endian int array |
3104 | // |
3105 | address generate_aescrypt_decryptBlock() { |
3106 | assert(UseAES, "need AES instructions and misaligned SSE support" ); |
3107 | __ align(CodeEntryAlignment); |
3108 | StubCodeMark mark(this, "StubRoutines" , "aescrypt_decryptBlock" ); |
3109 | Label L_doLast; |
3110 | address start = __ pc(); |
3111 | |
3112 | const Register from = c_rarg0; // source array address |
3113 | const Register to = c_rarg1; // destination array address |
3114 | const Register key = c_rarg2; // key array address |
3115 | const Register keylen = rax; |
3116 | |
3117 | const XMMRegister xmm_result = xmm0; |
3118 | const XMMRegister xmm_key_shuf_mask = xmm1; |
3119 | // On win64 xmm6-xmm15 must be preserved so don't use them. |
3120 | const XMMRegister xmm_temp1 = xmm2; |
3121 | const XMMRegister xmm_temp2 = xmm3; |
3122 | const XMMRegister xmm_temp3 = xmm4; |
3123 | const XMMRegister xmm_temp4 = xmm5; |
3124 | |
3125 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
3126 | |
3127 | // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} |
3128 | __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3129 | |
3130 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3131 | __ movdqu(xmm_result, Address(from, 0)); |
3132 | |
    // for decryption, the java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last
3135 | // we don't know if the key is aligned, hence not using load-execute form |
3136 | load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); |
3137 | load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); |
3138 | load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); |
3139 | load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); |
3140 | |
3141 | __ pxor (xmm_result, xmm_temp1); |
3142 | __ aesdec(xmm_result, xmm_temp2); |
3143 | __ aesdec(xmm_result, xmm_temp3); |
3144 | __ aesdec(xmm_result, xmm_temp4); |
3145 | |
3146 | load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); |
3147 | load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); |
3148 | load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); |
3149 | load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); |
3150 | |
3151 | __ aesdec(xmm_result, xmm_temp1); |
3152 | __ aesdec(xmm_result, xmm_temp2); |
3153 | __ aesdec(xmm_result, xmm_temp3); |
3154 | __ aesdec(xmm_result, xmm_temp4); |
3155 | |
3156 | load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); |
3157 | load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); |
3158 | load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); |
3159 | |
3160 | __ cmpl(keylen, 44); |
3161 | __ jccb(Assembler::equal, L_doLast); |
3162 | |
3163 | __ aesdec(xmm_result, xmm_temp1); |
3164 | __ aesdec(xmm_result, xmm_temp2); |
3165 | |
3166 | load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); |
3167 | load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); |
3168 | |
3169 | __ cmpl(keylen, 52); |
3170 | __ jccb(Assembler::equal, L_doLast); |
3171 | |
3172 | __ aesdec(xmm_result, xmm_temp1); |
3173 | __ aesdec(xmm_result, xmm_temp2); |
3174 | |
3175 | load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); |
3176 | load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); |
3177 | |
3178 | __ BIND(L_doLast); |
3179 | __ aesdec(xmm_result, xmm_temp1); |
3180 | __ aesdec(xmm_result, xmm_temp2); |
3181 | |
3182 | // for decryption the aesdeclast operation is always on key+0x00 |
3183 | __ aesdeclast(xmm_result, xmm_temp3); |
3184 | __ movdqu(Address(to, 0), xmm_result); // store the result |
3185 | __ xorptr(rax, rax); // return 0 |
3186 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
3187 | __ ret(0); |
3188 | |
3189 | return start; |
3190 | } |
3191 | |
3192 | |
3193 | // Arguments: |
3194 | // |
3195 | // Inputs: |
3196 | // c_rarg0 - source byte array address |
3197 | // c_rarg1 - destination byte array address |
3198 | // c_rarg2 - K (key) in little endian int array |
3199 | // c_rarg3 - r vector byte array address |
3200 | // c_rarg4 - input length |
3201 | // |
3202 | // Output: |
3203 | // rax - input length |
3204 | // |
3205 | address generate_cipherBlockChaining_encryptAESCrypt() { |
3206 | assert(UseAES, "need AES instructions and misaligned SSE support" ); |
3207 | __ align(CodeEntryAlignment); |
3208 | StubCodeMark mark(this, "StubRoutines" , "cipherBlockChaining_encryptAESCrypt" ); |
3209 | address start = __ pc(); |
3210 | |
3211 | Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; |
3212 | const Register from = c_rarg0; // source array address |
3213 | const Register to = c_rarg1; // destination array address |
3214 | const Register key = c_rarg2; // key array address |
3215 | const Register rvec = c_rarg3; // r byte array initialized from initvector array address |
3216 | // and left with the results of the last encryption block |
3217 | #ifndef _WIN64 |
3218 | const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) |
3219 | #else |
3220 | const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 |
3221 | const Register len_reg = r11; // pick the volatile windows register |
3222 | #endif |
3223 | const Register pos = rax; |
3224 | |
3225 | // xmm register assignments for the loops below |
3226 | const XMMRegister xmm_result = xmm0; |
3227 | const XMMRegister xmm_temp = xmm1; |
3228 | // keys 0-10 preloaded into xmm2-xmm12 |
3229 | const int XMM_REG_NUM_KEY_FIRST = 2; |
3230 | const int XMM_REG_NUM_KEY_LAST = 15; |
3231 | const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); |
3232 | const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); |
3233 | const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); |
3234 | const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); |
3235 | const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); |
3236 | |
3237 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
3238 | |
3239 | #ifdef _WIN64 |
3240 | // on win64, fill len_reg from stack position |
3241 | __ movl(len_reg, len_mem); |
3242 | #else |
3243 | __ push(len_reg); // Save |
3244 | #endif |
3245 | |
3246 | const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front |
3247 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3248 | // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 |
3249 | for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { |
3250 | load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3251 | offset += 0x10; |
3252 | } |
3253 | __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec |
3254 | |
    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
3256 | __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3257 | __ cmpl(rax, 44); |
3258 | __ jcc(Assembler::notEqual, L_key_192_256); |
3259 | |
3260 | // 128 bit code follows here |
3261 | __ movptr(pos, 0); |
3262 | __ align(OptoLoopAlignment); |
3263 | |
3264 | __ BIND(L_loopTop_128); |
3265 | __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3266 | __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3267 | __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3268 | for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { |
3269 | __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3270 | } |
3271 | __ aesenclast(xmm_result, xmm_key10); |
3272 | __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3273 | // no need to store r to memory until we exit |
3274 | __ addptr(pos, AESBlockSize); |
3275 | __ subptr(len_reg, AESBlockSize); |
3276 | __ jcc(Assembler::notEqual, L_loopTop_128); |
3277 | |
3278 | __ BIND(L_exit); |
3279 | __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object |
3280 | |
3281 | #ifdef _WIN64 |
3282 | __ movl(rax, len_mem); |
3283 | #else |
3284 | __ pop(rax); // return length |
3285 | #endif |
3286 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
3287 | __ ret(0); |
3288 | |
3289 | __ BIND(L_key_192_256); |
3290 | // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |
3291 | load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); |
3292 | load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); |
3293 | __ cmpl(rax, 52); |
3294 | __ jcc(Assembler::notEqual, L_key_256); |
3295 | |
3296 | // 192-bit code follows here (could be changed to use more xmm registers) |
3297 | __ movptr(pos, 0); |
3298 | __ align(OptoLoopAlignment); |
3299 | |
3300 | __ BIND(L_loopTop_192); |
3301 | __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3302 | __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3303 | __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3304 | for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { |
3305 | __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3306 | } |
3307 | __ aesenclast(xmm_result, xmm_key12); |
3308 | __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3309 | // no need to store r to memory until we exit |
3310 | __ addptr(pos, AESBlockSize); |
3311 | __ subptr(len_reg, AESBlockSize); |
3312 | __ jcc(Assembler::notEqual, L_loopTop_192); |
3313 | __ jmp(L_exit); |
3314 | |
3315 | __ BIND(L_key_256); |
3316 | // 256-bit code follows here (could be changed to use more xmm registers) |
3317 | load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); |
3318 | __ movptr(pos, 0); |
3319 | __ align(OptoLoopAlignment); |
3320 | |
3321 | __ BIND(L_loopTop_256); |
3322 | __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3323 | __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3324 | __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3325 | for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { |
3326 | __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3327 | } |
3328 | load_key(xmm_temp, key, 0xe0); |
3329 | __ aesenclast(xmm_result, xmm_temp); |
3330 | __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3331 | // no need to store r to memory until we exit |
3332 | __ addptr(pos, AESBlockSize); |
3333 | __ subptr(len_reg, AESBlockSize); |
3334 | __ jcc(Assembler::notEqual, L_loopTop_256); |
3335 | __ jmp(L_exit); |
3336 | |
3337 | return start; |
3338 | } |
3339 | |
3340 | // Safefetch stubs. |
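  // SafeFetch32/SafeFetchN perform a load that is allowed to fault: the VM's signal
  // handler recognizes *fault_pc and resumes execution at *continuation_pc, where the
  // caller-supplied errValue (already in c_rarg1) is returned instead of the loaded value.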
3341 | void generate_safefetch(const char* name, int size, address* entry, |
3342 | address* fault_pc, address* continuation_pc) { |
3343 | // safefetch signatures: |
3344 | // int SafeFetch32(int* adr, int errValue); |
3345 | // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); |
3346 | // |
3347 | // arguments: |
3348 | // c_rarg0 = adr |
3349 | // c_rarg1 = errValue |
3350 | // |
3351 | // result: |
    //   rax = *adr or errValue
3353 | |
3354 | StubCodeMark mark(this, "StubRoutines" , name); |
3355 | |
3356 | // Entry point, pc or function descriptor. |
3357 | *entry = __ pc(); |
3358 | |
3359 | // Load *adr into c_rarg1, may fault. |
3360 | *fault_pc = __ pc(); |
3361 | switch (size) { |
3362 | case 4: |
3363 | // int32_t |
3364 | __ movl(c_rarg1, Address(c_rarg0, 0)); |
3365 | break; |
3366 | case 8: |
3367 | // int64_t |
3368 | __ movq(c_rarg1, Address(c_rarg0, 0)); |
3369 | break; |
3370 | default: |
3371 | ShouldNotReachHere(); |
3372 | } |
3373 | |
3374 | // return errValue or *adr |
3375 | *continuation_pc = __ pc(); |
3376 | __ movq(rax, c_rarg1); |
3377 | __ ret(0); |
3378 | } |
3379 | |
3380 | // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time |
3381 | // to hide instruction latency |
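  // Unlike CBC encryption, CBC decryption can be parallelized: each plaintext block
  // depends only on its own ciphertext block and the previous ciphertext block, both of
  // which are already available, so four blocks are decrypted per main-loop iteration.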
3382 | // |
3383 | // Arguments: |
3384 | // |
3385 | // Inputs: |
3386 | // c_rarg0 - source byte array address |
3387 | // c_rarg1 - destination byte array address |
3388 | // c_rarg2 - K (key) in little endian int array |
3389 | // c_rarg3 - r vector byte array address |
3390 | // c_rarg4 - input length |
3391 | // |
3392 | // Output: |
3393 | // rax - input length |
3394 | // |
3395 | address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { |
3396 | assert(UseAES, "need AES instructions and misaligned SSE support" ); |
3397 | __ align(CodeEntryAlignment); |
3398 | StubCodeMark mark(this, "StubRoutines" , "cipherBlockChaining_decryptAESCrypt" ); |
3399 | address start = __ pc(); |
3400 | |
3401 | const Register from = c_rarg0; // source array address |
3402 | const Register to = c_rarg1; // destination array address |
3403 | const Register key = c_rarg2; // key array address |
3404 | const Register rvec = c_rarg3; // r byte array initialized from initvector array address |
3405 | // and left with the results of the last encryption block |
3406 | #ifndef _WIN64 |
3407 | const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) |
3408 | #else |
3409 | const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 |
3410 | const Register len_reg = r11; // pick the volatile windows register |
3411 | #endif |
3412 | const Register pos = rax; |
3413 | |
3414 | const int PARALLEL_FACTOR = 4; |
3415 | const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256 |
3416 | |
3417 | Label L_exit; |
3418 | Label L_singleBlock_loopTopHead[3]; // 128, 192, 256 |
3419 | Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256 |
3420 | Label L_singleBlock_loopTop[3]; // 128, 192, 256 |
3421 | Label L_multiBlock_loopTopHead[3]; // 128, 192, 256 |
3422 | Label L_multiBlock_loopTop[3]; // 128, 192, 256 |
3423 | |
3424 | // keys 0-10 preloaded into xmm5-xmm15 |
3425 | const int XMM_REG_NUM_KEY_FIRST = 5; |
3426 | const int XMM_REG_NUM_KEY_LAST = 15; |
3427 | const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); |
3428 | const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); |
3429 | |
3430 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
3431 | |
3432 | #ifdef _WIN64 |
3433 | // on win64, fill len_reg from stack position |
3434 | __ movl(len_reg, len_mem); |
3435 | #else |
3436 | __ push(len_reg); // Save |
3437 | #endif |
3438 | __ push(rbx); |
3439 | // the java expanded key ordering is rotated one position from what we want |
3440 | // so we start from 0x10 here and hit 0x00 last |
3441 | const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front |
3442 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3443 | // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 |
3444 | for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { |
3445 | load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3446 | offset += 0x10; |
3447 | } |
3448 | load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); |
3449 | |
3450 | const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block |
3451 | |
3452 | // registers holding the four results in the parallelized loop |
3453 | const XMMRegister xmm_result0 = xmm0; |
3454 | const XMMRegister xmm_result1 = xmm2; |
3455 | const XMMRegister xmm_result2 = xmm3; |
3456 | const XMMRegister xmm_result3 = xmm4; |
3457 | |
3458 | __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec |
3459 | |
3460 | __ xorptr(pos, pos); |
3461 | |
    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
3463 | __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3464 | __ cmpl(rbx, 52); |
3465 | __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]); |
3466 | __ cmpl(rbx, 60); |
3467 | __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]); |
3468 | |
3469 | #define DoFour(opc, src_reg) \ |
3470 | __ opc(xmm_result0, src_reg); \ |
3471 | __ opc(xmm_result1, src_reg); \ |
3472 | __ opc(xmm_result2, src_reg); \ |
3473 | __ opc(xmm_result3, src_reg); \ |
3474 | |
3475 | for (int k = 0; k < 3; ++k) { |
3476 | __ BIND(L_multiBlock_loopTopHead[k]); |
3477 | if (k != 0) { |
3478 | __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left |
3479 | __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]); |
3480 | } |
3481 | if (k == 1) { |
3482 | __ subptr(rsp, 6 * wordSize); |
3483 | __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 |
3484 | load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0 |
3485 | __ movdqu(Address(rsp, 2 * wordSize), xmm15); |
3486 | load_key(xmm1, key, 0xc0); // 0xc0; |
3487 | __ movdqu(Address(rsp, 4 * wordSize), xmm1); |
3488 | } else if (k == 2) { |
3489 | __ subptr(rsp, 10 * wordSize); |
3490 | __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 |
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3492 | __ movdqu(Address(rsp, 6 * wordSize), xmm15); |
3493 | load_key(xmm1, key, 0xe0); // 0xe0; |
3494 | __ movdqu(Address(rsp, 8 * wordSize), xmm1); |
3495 | load_key(xmm15, key, 0xb0); // 0xb0; |
3496 | __ movdqu(Address(rsp, 2 * wordSize), xmm15); |
3497 | load_key(xmm1, key, 0xc0); // 0xc0; |
3498 | __ movdqu(Address(rsp, 4 * wordSize), xmm1); |
3499 | } |
3500 | __ align(OptoLoopAlignment); |
3501 | __ BIND(L_multiBlock_loopTop[k]); |
3502 | __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left |
3503 | __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]); |
3504 | |
3505 | if (k != 0) { |
3506 | __ movdqu(xmm15, Address(rsp, 2 * wordSize)); |
3507 | __ movdqu(xmm1, Address(rsp, 4 * wordSize)); |
3508 | } |
3509 | |
3510 | __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers |
3511 | __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); |
3512 | __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); |
3513 | __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); |
3514 | |
3515 | DoFour(pxor, xmm_key_first); |
3516 | if (k == 0) { |
3517 | for (int rnum = 1; rnum < ROUNDS[k]; rnum++) { |
3518 | DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); |
3519 | } |
3520 | DoFour(aesdeclast, xmm_key_last); |
3521 | } else if (k == 1) { |
3522 | for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) { |
3523 | DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); |
3524 | } |
3525 | __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. |
3526 | DoFour(aesdec, xmm1); // key : 0xc0 |
3527 | __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again |
3528 | DoFour(aesdeclast, xmm_key_last); |
3529 | } else if (k == 2) { |
3530 | for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) { |
3531 | DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); |
3532 | } |
3533 | DoFour(aesdec, xmm1); // key : 0xc0 |
3534 | __ movdqu(xmm15, Address(rsp, 6 * wordSize)); |
3535 | __ movdqu(xmm1, Address(rsp, 8 * wordSize)); |
3536 | DoFour(aesdec, xmm15); // key : 0xd0 |
3537 | __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. |
3538 | DoFour(aesdec, xmm1); // key : 0xe0 |
3539 | __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again |
3540 | DoFour(aesdeclast, xmm_key_last); |
3541 | } |
3542 | |
3543 | // for each result, xor with the r vector of previous cipher block |
3544 | __ pxor(xmm_result0, xmm_prev_block_cipher); |
3545 | __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize)); |
3546 | __ pxor(xmm_result1, xmm_prev_block_cipher); |
3547 | __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize)); |
3548 | __ pxor(xmm_result2, xmm_prev_block_cipher); |
3549 | __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize)); |
3550 | __ pxor(xmm_result3, xmm_prev_block_cipher); |
3551 | __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks |
3552 | if (k != 0) { |
3553 | __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher); |
3554 | } |
3555 | |
3556 | __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output |
3557 | __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); |
3558 | __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); |
3559 | __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); |
3560 | |
3561 | __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); |
3562 | __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); |
3563 | __ jmp(L_multiBlock_loopTop[k]); |
3564 | |
3565 | // registers used in the non-parallelized loops |
3566 | // xmm register assignments for the loops below |
3567 | const XMMRegister xmm_result = xmm0; |
3568 | const XMMRegister xmm_prev_block_cipher_save = xmm2; |
3569 | const XMMRegister xmm_key11 = xmm3; |
3570 | const XMMRegister xmm_key12 = xmm4; |
3571 | const XMMRegister key_tmp = xmm4; |
3572 | |
3573 | __ BIND(L_singleBlock_loopTopHead[k]); |
3574 | if (k == 1) { |
3575 | __ addptr(rsp, 6 * wordSize); |
3576 | } else if (k == 2) { |
3577 | __ addptr(rsp, 10 * wordSize); |
3578 | } |
3579 | __ cmpptr(len_reg, 0); // any blocks left?? |
3580 | __ jcc(Assembler::equal, L_exit); |
3581 | __ BIND(L_singleBlock_loopTopHead2[k]); |
3582 | if (k == 1) { |
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3585 | } |
3586 | if (k == 2) { |
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3588 | } |
3589 | __ align(OptoLoopAlignment); |
3590 | __ BIND(L_singleBlock_loopTop[k]); |
3591 | __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
3592 | __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector |
3593 | __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds |
3594 | for (int rnum = 1; rnum <= 9 ; rnum++) { |
3595 | __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); |
3596 | } |
3597 | if (k == 1) { |
3598 | __ aesdec(xmm_result, xmm_key11); |
3599 | __ aesdec(xmm_result, xmm_key12); |
3600 | } |
3601 | if (k == 2) { |
3602 | __ aesdec(xmm_result, xmm_key11); |
3603 | load_key(key_tmp, key, 0xc0); |
3604 | __ aesdec(xmm_result, key_tmp); |
3605 | load_key(key_tmp, key, 0xd0); |
3606 | __ aesdec(xmm_result, key_tmp); |
3607 | load_key(key_tmp, key, 0xe0); |
3608 | __ aesdec(xmm_result, key_tmp); |
3609 | } |
3610 | |
3611 | __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 |
3612 | __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector |
3613 | __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3614 | // no need to store r to memory until we exit |
3615 | __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block |
3616 | __ addptr(pos, AESBlockSize); |
3617 | __ subptr(len_reg, AESBlockSize); |
3618 | __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); |
3619 | if (k != 2) { |
3620 | __ jmp(L_exit); |
3621 | } |
3622 | } //for 128/192/256 |
3623 | |
3624 | __ BIND(L_exit); |
3625 | __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object |
3626 | __ pop(rbx); |
3627 | #ifdef _WIN64 |
3628 | __ movl(rax, len_mem); |
3629 | #else |
3630 | __ pop(rax); // return length |
3631 | #endif |
3632 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
3633 | __ ret(0); |
3634 | return start; |
3635 | } |
3636 | |
3637 | address generate_upper_word_mask() { |
3638 | __ align(64); |
3639 | StubCodeMark mark(this, "StubRoutines" , "upper_word_mask" ); |
3640 | address start = __ pc(); |
3641 | __ emit_data64(0x0000000000000000, relocInfo::none); |
3642 | __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); |
3643 | return start; |
3644 | } |
3645 | |
3646 | address generate_shuffle_byte_flip_mask() { |
3647 | __ align(64); |
3648 | StubCodeMark mark(this, "StubRoutines" , "shuffle_byte_flip_mask" ); |
3649 | address start = __ pc(); |
3650 | __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); |
3651 | __ emit_data64(0x0001020304050607, relocInfo::none); |
3652 | return start; |
3653 | } |
3654 | |
  // ofs and limit are used for the multi-block byte array.
3656 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
3657 | address generate_sha1_implCompress(bool multi_block, const char *name) { |
3658 | __ align(CodeEntryAlignment); |
3659 | StubCodeMark mark(this, "StubRoutines" , name); |
3660 | address start = __ pc(); |
3661 | |
3662 | Register buf = c_rarg0; |
3663 | Register state = c_rarg1; |
3664 | Register ofs = c_rarg2; |
3665 | Register limit = c_rarg3; |
3666 | |
3667 | const XMMRegister abcd = xmm0; |
3668 | const XMMRegister e0 = xmm1; |
3669 | const XMMRegister e1 = xmm2; |
3670 | const XMMRegister msg0 = xmm3; |
3671 | |
3672 | const XMMRegister msg1 = xmm4; |
3673 | const XMMRegister msg2 = xmm5; |
3674 | const XMMRegister msg3 = xmm6; |
3675 | const XMMRegister shuf_mask = xmm7; |
3676 | |
3677 | __ enter(); |
3678 | |
3679 | __ subptr(rsp, 4 * wordSize); |
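    // rsp is handed to fast_sha1() below as a small scratch area (the 4 words just reserved).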
3680 | |
3681 | __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, |
3682 | buf, state, ofs, limit, rsp, multi_block); |
3683 | |
3684 | __ addptr(rsp, 4 * wordSize); |
3685 | |
3686 | __ leave(); |
3687 | __ ret(0); |
3688 | return start; |
3689 | } |
3690 | |
3691 | address generate_pshuffle_byte_flip_mask() { |
3692 | __ align(64); |
3693 | StubCodeMark mark(this, "StubRoutines" , "pshuffle_byte_flip_mask" ); |
3694 | address start = __ pc(); |
3695 | __ emit_data64(0x0405060700010203, relocInfo::none); |
3696 | __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); |
3697 | |
3698 | if (VM_Version::supports_avx2()) { |
3699 | __ emit_data64(0x0405060700010203, relocInfo::none); // second copy |
3700 | __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); |
3701 | // _SHUF_00BA |
3702 | __ emit_data64(0x0b0a090803020100, relocInfo::none); |
3703 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3704 | __ emit_data64(0x0b0a090803020100, relocInfo::none); |
3705 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3706 | // _SHUF_DC00 |
3707 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3708 | __ emit_data64(0x0b0a090803020100, relocInfo::none); |
3709 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3710 | __ emit_data64(0x0b0a090803020100, relocInfo::none); |
3711 | } |
3712 | |
3713 | return start; |
3714 | } |
3715 | |
3716 | //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. |
3717 | address generate_pshuffle_byte_flip_mask_sha512() { |
3718 | __ align(32); |
3719 | StubCodeMark mark(this, "StubRoutines" , "pshuffle_byte_flip_mask_sha512" ); |
3720 | address start = __ pc(); |
3721 | if (VM_Version::supports_avx2()) { |
3722 | __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK |
3723 | __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); |
3724 | __ emit_data64(0x1011121314151617, relocInfo::none); |
3725 | __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none); |
3726 | __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO |
3727 | __ emit_data64(0x0000000000000000, relocInfo::none); |
3728 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3729 | __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); |
3730 | } |
3731 | |
3732 | return start; |
3733 | } |
3734 | |
  // ofs and limit are used for the multi-block byte array.
3736 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
3737 | address generate_sha256_implCompress(bool multi_block, const char *name) { |
3738 | assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "" ); |
3739 | __ align(CodeEntryAlignment); |
3740 | StubCodeMark mark(this, "StubRoutines" , name); |
3741 | address start = __ pc(); |
3742 | |
3743 | Register buf = c_rarg0; |
3744 | Register state = c_rarg1; |
3745 | Register ofs = c_rarg2; |
3746 | Register limit = c_rarg3; |
3747 | |
3748 | const XMMRegister msg = xmm0; |
3749 | const XMMRegister state0 = xmm1; |
3750 | const XMMRegister state1 = xmm2; |
3751 | const XMMRegister msgtmp0 = xmm3; |
3752 | |
3753 | const XMMRegister msgtmp1 = xmm4; |
3754 | const XMMRegister msgtmp2 = xmm5; |
3755 | const XMMRegister msgtmp3 = xmm6; |
3756 | const XMMRegister msgtmp4 = xmm7; |
3757 | |
3758 | const XMMRegister shuf_mask = xmm8; |
3759 | |
3760 | __ enter(); |
3761 | |
3762 | __ subptr(rsp, 4 * wordSize); |
3763 | |
3764 | if (VM_Version::supports_sha()) { |
3765 | __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, |
3766 | buf, state, ofs, limit, rsp, multi_block, shuf_mask); |
3767 | } else if (VM_Version::supports_avx2()) { |
3768 | __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, |
3769 | buf, state, ofs, limit, rsp, multi_block, shuf_mask); |
3770 | } |
3771 | __ addptr(rsp, 4 * wordSize); |
3772 | __ vzeroupper(); |
3773 | __ leave(); |
3774 | __ ret(0); |
3775 | return start; |
3776 | } |
3777 | |
3778 | address generate_sha512_implCompress(bool multi_block, const char *name) { |
3779 | assert(VM_Version::supports_avx2(), "" ); |
3780 | assert(VM_Version::supports_bmi2(), "" ); |
3781 | __ align(CodeEntryAlignment); |
3782 | StubCodeMark mark(this, "StubRoutines" , name); |
3783 | address start = __ pc(); |
3784 | |
3785 | Register buf = c_rarg0; |
3786 | Register state = c_rarg1; |
3787 | Register ofs = c_rarg2; |
3788 | Register limit = c_rarg3; |
3789 | |
3790 | const XMMRegister msg = xmm0; |
3791 | const XMMRegister state0 = xmm1; |
3792 | const XMMRegister state1 = xmm2; |
3793 | const XMMRegister msgtmp0 = xmm3; |
3794 | const XMMRegister msgtmp1 = xmm4; |
3795 | const XMMRegister msgtmp2 = xmm5; |
3796 | const XMMRegister msgtmp3 = xmm6; |
3797 | const XMMRegister msgtmp4 = xmm7; |
3798 | |
3799 | const XMMRegister shuf_mask = xmm8; |
3800 | |
3801 | __ enter(); |
3802 | |
3803 | __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, |
3804 | buf, state, ofs, limit, rsp, multi_block, shuf_mask); |
3805 | |
3806 | __ vzeroupper(); |
3807 | __ leave(); |
3808 | __ ret(0); |
3809 | return start; |
3810 | } |
3811 | |
3812 | // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time |
3813 | // to hide instruction latency |
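  // In CTR mode the counter blocks are independent of the data, so six counters can be
  // encrypted in parallel and XORed with six input blocks per iteration of the main loop.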
3814 | // |
3815 | // Arguments: |
3816 | // |
3817 | // Inputs: |
3818 | // c_rarg0 - source byte array address |
3819 | // c_rarg1 - destination byte array address |
3820 | // c_rarg2 - K (key) in little endian int array |
3821 | // c_rarg3 - counter vector byte array address |
3822 | // Linux |
3823 | // c_rarg4 - input length |
3824 | // c_rarg5 - saved encryptedCounter start |
3825 | // rbp + 6 * wordSize - saved used length |
3826 | // Windows |
3827 | // rbp + 6 * wordSize - input length |
3828 | // rbp + 7 * wordSize - saved encryptedCounter start |
3829 | // rbp + 8 * wordSize - saved used length |
3830 | // |
3831 | // Output: |
3832 | // rax - input length |
3833 | // |
3834 | address generate_counterMode_AESCrypt_Parallel() { |
3835 | assert(UseAES, "need AES instructions and misaligned SSE support" ); |
3836 | __ align(CodeEntryAlignment); |
3837 | StubCodeMark mark(this, "StubRoutines" , "counterMode_AESCrypt" ); |
3838 | address start = __ pc(); |
3839 | const Register from = c_rarg0; // source array address |
3840 | const Register to = c_rarg1; // destination array address |
3841 | const Register key = c_rarg2; // key array address |
3842 | const Register counter = c_rarg3; // counter byte array initialized from counter array address |
3843 | // and updated with the incremented counter in the end |
3844 | #ifndef _WIN64 |
3845 | const Register len_reg = c_rarg4; |
3846 | const Register saved_encCounter_start = c_rarg5; |
3847 | const Register used_addr = r10; |
3848 | const Address used_mem(rbp, 2 * wordSize); |
3849 | const Register used = r11; |
3850 | #else |
3851 | const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 |
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
3854 | const Register len_reg = r10; // pick the first volatile windows register |
3855 | const Register saved_encCounter_start = r11; |
3856 | const Register used_addr = r13; |
3857 | const Register used = r14; |
3858 | #endif |
3859 | const Register pos = rax; |
3860 | |
3861 | const int PARALLEL_FACTOR = 6; |
3862 | const XMMRegister xmm_counter_shuf_mask = xmm0; |
3863 | const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front |
3864 | const XMMRegister xmm_curr_counter = xmm2; |
3865 | |
3866 | const XMMRegister xmm_key_tmp0 = xmm3; |
3867 | const XMMRegister xmm_key_tmp1 = xmm4; |
3868 | |
3869 | // registers holding the four results in the parallelized loop |
3870 | const XMMRegister xmm_result0 = xmm5; |
3871 | const XMMRegister xmm_result1 = xmm6; |
3872 | const XMMRegister xmm_result2 = xmm7; |
3873 | const XMMRegister xmm_result3 = xmm8; |
3874 | const XMMRegister xmm_result4 = xmm9; |
3875 | const XMMRegister xmm_result5 = xmm10; |
3876 | |
3877 | const XMMRegister xmm_from0 = xmm11; |
3878 | const XMMRegister xmm_from1 = xmm12; |
3879 | const XMMRegister xmm_from2 = xmm13; |
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
    const XMMRegister xmm_from4 = xmm3; // reuse xmm3-xmm4; xmm_key_tmp0/1 are not needed once the input text is loaded
3882 | const XMMRegister xmm_from5 = xmm4; |
3883 | |
3884 | //for key_128, key_192, key_256 |
3885 | const int rounds[3] = {10, 12, 14}; |
3886 | Label L_exit_preLoop, L_preLoop_start; |
3887 | Label L_multiBlock_loopTop[3]; |
3888 | Label L_singleBlockLoopTop[3]; |
3889 | Label L__incCounter[3][6]; //for 6 blocks |
3890 | Label L__incCounter_single[3]; //for single block, key128, key192, key256 |
3891 | Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; |
3892 | Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; |
3893 | |
3894 | Label L_exit; |
3895 | |
3896 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
3897 | |
3898 | #ifdef _WIN64 |
3899 | // allocate spill slots for r13, r14 |
3900 | enum { |
3901 | saved_r13_offset, |
3902 | saved_r14_offset |
3903 | }; |
3904 | __ subptr(rsp, 2 * wordSize); |
3905 | __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); |
3906 | __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); |
3907 | |
3908 | // on win64, fill len_reg from stack position |
3909 | __ movl(len_reg, len_mem); |
3910 | __ movptr(saved_encCounter_start, saved_encCounter_mem); |
3911 | __ movptr(used_addr, used_mem); |
3912 | __ movl(used, Address(used_addr, 0)); |
3913 | #else |
3914 | __ push(len_reg); // Save |
3915 | __ movptr(used_addr, used_mem); |
3916 | __ movl(used, Address(used_addr, 0)); |
3917 | #endif |
3918 | |
3919 | __ push(rbx); // Save RBX |
    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize xmm_curr_counter with the initial counter value
3921 | __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch |
3922 | __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled |
3923 | __ movptr(pos, 0); |
3924 | |
    // Use the partially used encrypted counter from the last invocation
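    // 'used' counts how many bytes of the 16-byte encrypted counter saved by the previous
    // call have already been consumed; while used < 16 and input remains, XOR one byte of
    // input with the corresponding saved byte before falling into the block loops.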
3926 | __ BIND(L_preLoop_start); |
3927 | __ cmpptr(used, 16); |
3928 | __ jcc(Assembler::aboveEqual, L_exit_preLoop); |
3929 | __ cmpptr(len_reg, 0); |
3930 | __ jcc(Assembler::lessEqual, L_exit_preLoop); |
3931 | __ movb(rbx, Address(saved_encCounter_start, used)); |
3932 | __ xorb(rbx, Address(from, pos)); |
3933 | __ movb(Address(to, pos), rbx); |
3934 | __ addptr(pos, 1); |
3935 | __ addptr(used, 1); |
3936 | __ subptr(len_reg, 1); |
3937 | |
3938 | __ jmp(L_preLoop_start); |
3939 | |
3940 | __ BIND(L_exit_preLoop); |
3941 | __ movl(Address(used_addr, 0), used); |
3942 | |
3943 | // key length could be only {11, 13, 15} * 4 = {44, 52, 60} |
3944 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch |
3945 | __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3946 | __ cmpl(rbx, 52); |
3947 | __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); |
3948 | __ cmpl(rbx, 60); |
3949 | __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); |
3950 | |
3951 | #define CTR_DoSix(opc, src_reg) \ |
3952 | __ opc(xmm_result0, src_reg); \ |
3953 | __ opc(xmm_result1, src_reg); \ |
3954 | __ opc(xmm_result2, src_reg); \ |
3955 | __ opc(xmm_result3, src_reg); \ |
3956 | __ opc(xmm_result4, src_reg); \ |
3957 | __ opc(xmm_result5, src_reg); |
3958 | |
3959 | // k == 0 : generate code for key_128 |
3960 | // k == 1 : generate code for key_192 |
3961 | // k == 2 : generate code for key_256 |
3962 | for (int k = 0; k < 3; ++k) { |
3963 | //multi blocks starts here |
3964 | __ align(OptoLoopAlignment); |
3965 | __ BIND(L_multiBlock_loopTop[k]); |
3966 | __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left |
3967 | __ jcc(Assembler::less, L_singleBlockLoopTop[k]); |
3968 | load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); |
3969 | |
3970 | //load, then increase counters |
3971 | CTR_DoSix(movdqa, xmm_curr_counter); |
3972 | inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); |
3973 | inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); |
3974 | inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); |
3975 | inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); |
3976 | inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); |
3977 | inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); |
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
3979 | CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key |
3980 | |
3981 | //load two ROUND_KEYs at a time |
3982 | for (int i = 1; i < rounds[k]; ) { |
3983 | load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); |
3984 | load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); |
3985 | CTR_DoSix(aesenc, xmm_key_tmp1); |
3986 | i++; |
3987 | if (i != rounds[k]) { |
3988 | CTR_DoSix(aesenc, xmm_key_tmp0); |
3989 | } else { |
3990 | CTR_DoSix(aesenclast, xmm_key_tmp0); |
3991 | } |
3992 | i++; |
3993 | } |
3994 | |
3995 | // get next PARALLEL_FACTOR blocks into xmm_result registers |
3996 | __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); |
3997 | __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); |
3998 | __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); |
3999 | __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); |
4000 | __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); |
4001 | __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); |
4002 | |
4003 | __ pxor(xmm_result0, xmm_from0); |
4004 | __ pxor(xmm_result1, xmm_from1); |
4005 | __ pxor(xmm_result2, xmm_from2); |
4006 | __ pxor(xmm_result3, xmm_from3); |
4007 | __ pxor(xmm_result4, xmm_from4); |
4008 | __ pxor(xmm_result5, xmm_from5); |
4009 | |
      // store 6 results into the next 96 bytes of output
4011 | __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); |
4012 | __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); |
4013 | __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); |
4014 | __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); |
4015 | __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); |
4016 | __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); |
4017 | |
      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance pos past the 6 blocks just processed
4019 | __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length |
4020 | __ jmp(L_multiBlock_loopTop[k]); |
4021 | |
4022 | // singleBlock starts here |
4023 | __ align(OptoLoopAlignment); |
4024 | __ BIND(L_singleBlockLoopTop[k]); |
4025 | __ cmpptr(len_reg, 0); |
4026 | __ jcc(Assembler::lessEqual, L_exit); |
4027 | load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); |
4028 | __ movdqa(xmm_result0, xmm_curr_counter); |
4029 | inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); |
4030 | __ pshufb(xmm_result0, xmm_counter_shuf_mask); |
4031 | __ pxor(xmm_result0, xmm_key_tmp0); |
4032 | for (int i = 1; i < rounds[k]; i++) { |
4033 | load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); |
4034 | __ aesenc(xmm_result0, xmm_key_tmp0); |
4035 | } |
4036 | load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); |
4037 | __ aesenclast(xmm_result0, xmm_key_tmp0); |
4038 | __ cmpptr(len_reg, AESBlockSize); |
4039 | __ jcc(Assembler::less, L_processTail_insr[k]); |
4040 | __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); |
4041 | __ pxor(xmm_result0, xmm_from0); |
4042 | __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); |
4043 | __ addptr(pos, AESBlockSize); |
4044 | __ subptr(len_reg, AESBlockSize); |
4045 | __ jmp(L_singleBlockLoopTop[k]); |
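      // Tail handling: fewer than AESBlockSize bytes remain.  The leftover bytes are gathered
      // into xmm_from0 with pinsrq/pinsrd/pinsrw/pinsrb so we never read past the end of the
      // source array, XORed with the encrypted counter, and written back out the same way;
      // the full encrypted counter is saved so the next call can consume its unused bytes.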
4046 | __ BIND(L_processTail_insr[k]); // Process the tail part of the input array |
4047 | __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register |
4048 | __ testptr(len_reg, 8); |
4049 | __ jcc(Assembler::zero, L_processTail_4_insr[k]); |
4050 | __ subptr(pos,8); |
4051 | __ pinsrq(xmm_from0, Address(from, pos), 0); |
4052 | __ BIND(L_processTail_4_insr[k]); |
4053 | __ testptr(len_reg, 4); |
4054 | __ jcc(Assembler::zero, L_processTail_2_insr[k]); |
4055 | __ subptr(pos,4); |
4056 | __ pslldq(xmm_from0, 4); |
4057 | __ pinsrd(xmm_from0, Address(from, pos), 0); |
4058 | __ BIND(L_processTail_2_insr[k]); |
4059 | __ testptr(len_reg, 2); |
4060 | __ jcc(Assembler::zero, L_processTail_1_insr[k]); |
4061 | __ subptr(pos, 2); |
4062 | __ pslldq(xmm_from0, 2); |
4063 | __ pinsrw(xmm_from0, Address(from, pos), 0); |
4064 | __ BIND(L_processTail_1_insr[k]); |
4065 | __ testptr(len_reg, 1); |
4066 | __ jcc(Assembler::zero, L_processTail_exit_insr[k]); |
4067 | __ subptr(pos, 1); |
4068 | __ pslldq(xmm_from0, 1); |
4069 | __ pinsrb(xmm_from0, Address(from, pos), 0); |
4070 | __ BIND(L_processTail_exit_insr[k]); |
4071 | |
4072 | __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes. |
4073 | __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation. |
4074 | |
4075 | __ testptr(len_reg, 8); |
4076 | __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array |
4077 | __ pextrq(Address(to, pos), xmm_result0, 0); |
4078 | __ psrldq(xmm_result0, 8); |
4079 | __ addptr(pos, 8); |
4080 | __ BIND(L_processTail_4_extr[k]); |
4081 | __ testptr(len_reg, 4); |
4082 | __ jcc(Assembler::zero, L_processTail_2_extr[k]); |
4083 | __ pextrd(Address(to, pos), xmm_result0, 0); |
4084 | __ psrldq(xmm_result0, 4); |
4085 | __ addptr(pos, 4); |
4086 | __ BIND(L_processTail_2_extr[k]); |
4087 | __ testptr(len_reg, 2); |
4088 | __ jcc(Assembler::zero, L_processTail_1_extr[k]); |
4089 | __ pextrw(Address(to, pos), xmm_result0, 0); |
4090 | __ psrldq(xmm_result0, 2); |
4091 | __ addptr(pos, 2); |
4092 | __ BIND(L_processTail_1_extr[k]); |
4093 | __ testptr(len_reg, 1); |
4094 | __ jcc(Assembler::zero, L_processTail_exit_extr[k]); |
4095 | __ pextrb(Address(to, pos), xmm_result0, 0); |
4096 | |
4097 | __ BIND(L_processTail_exit_extr[k]); |
4098 | __ movl(Address(used_addr, 0), len_reg); |
4099 | __ jmp(L_exit); |
4100 | |
4101 | } |
4102 | |
4103 | __ BIND(L_exit); |
4104 | __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. |
4105 | __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back |
4106 | __ pop(rbx); // pop the saved RBX. |
4107 | #ifdef _WIN64 |
4108 | __ movl(rax, len_mem); |
4109 | __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); |
4110 | __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); |
4111 | __ addptr(rsp, 2 * wordSize); |
4112 | #else |
4113 | __ pop(rax); // return 'len' |
4114 | #endif |
4115 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
4116 | __ ret(0); |
4117 | return start; |
4118 | } |
4119 | |
4120 | void roundDec(XMMRegister xmm_reg) { |
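    // One AES decryption round applied to all eight 512-bit working registers
    // (xmm1..xmm8); with four 16-byte blocks per register that is 32 blocks in flight.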
4121 | __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); |
4122 | __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); |
4123 | __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); |
4124 | __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); |
4125 | __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); |
4126 | __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); |
4127 | __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); |
4128 | __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); |
4129 | } |
4130 | |
4131 | void roundDeclast(XMMRegister xmm_reg) { |
4132 | __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); |
4133 | __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); |
4134 | __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); |
4135 | __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); |
4136 | __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); |
4137 | __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); |
4138 | __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); |
4139 | __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); |
4140 | } |
4141 | |
4142 | void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) { |
4143 | __ movdqu(xmmdst, Address(key, offset)); |
4144 | if (xmm_shuf_mask != NULL) { |
4145 | __ pshufb(xmmdst, xmm_shuf_mask); |
4146 | } else { |
4147 | __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
4148 | } |
4149 | __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); |
4150 | |
4151 | } |
4152 | |
4153 | address generate_cipherBlockChaining_decryptVectorAESCrypt() { |
assert(VM_Version::supports_vaes(), "need vector AES (VAES) instructions");
4155 | __ align(CodeEntryAlignment); |
4156 | StubCodeMark mark(this, "StubRoutines" , "cipherBlockChaining_decryptAESCrypt" ); |
4157 | address start = __ pc(); |
4158 | |
4159 | const Register from = c_rarg0; // source array address |
4160 | const Register to = c_rarg1; // destination array address |
4161 | const Register key = c_rarg2; // key array address |
4162 | const Register rvec = c_rarg3; // r byte array initialized from initvector array address |
// and left with the last ciphertext block processed (used as the IV for a subsequent call)
4164 | #ifndef _WIN64 |
4165 | const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) |
4166 | #else |
4167 | const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 |
4168 | const Register len_reg = r11; // pick the volatile windows register |
4169 | #endif |
4170 | |
4171 | Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop, |
4172 | Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit; |
4173 | |
4174 | __ enter(); |
4175 | |
4176 | #ifdef _WIN64 |
4177 | // on win64, fill len_reg from stack position |
4178 | __ movl(len_reg, len_mem); |
4179 | #else |
4180 | __ push(len_reg); // Save |
4181 | #endif |
4182 | __ push(rbx); |
4183 | __ vzeroupper(); |
4184 | |
4185 | // Temporary variable declaration for swapping key bytes |
4186 | const XMMRegister xmm_key_shuf_mask = xmm1; |
4187 | __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
4188 | |
// Determine the AES variant from the expanded key length (in ints): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
4190 | const Register rounds = rbx; |
4191 | __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
4192 | |
4193 | const XMMRegister IV = xmm0; |
4194 | // Load IV and broadcast value to 512-bits |
4195 | __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit); |
4196 | |
4197 | // Temporary variables for storing round keys |
4198 | const XMMRegister RK0 = xmm30; |
4199 | const XMMRegister RK1 = xmm9; |
4200 | const XMMRegister RK2 = xmm18; |
4201 | const XMMRegister RK3 = xmm19; |
4202 | const XMMRegister RK4 = xmm20; |
4203 | const XMMRegister RK5 = xmm21; |
4204 | const XMMRegister RK6 = xmm22; |
4205 | const XMMRegister RK7 = xmm23; |
4206 | const XMMRegister RK8 = xmm24; |
4207 | const XMMRegister RK9 = xmm25; |
4208 | const XMMRegister RK10 = xmm26; |
4209 | |
4210 | // Load and shuffle key |
4211 | // the java expanded key ordering is rotated one position from what we want |
4212 | // so we start from 1*16 here and hit 0*16 last |
4213 | ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask); |
4214 | ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask); |
4215 | ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask); |
4216 | ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask); |
4217 | ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask); |
4218 | ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask); |
4219 | ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask); |
4220 | ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask); |
4221 | ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask); |
4222 | ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask); |
4223 | ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask); |
4224 | |
4225 | // Variables for storing source cipher text |
4226 | const XMMRegister S0 = xmm10; |
4227 | const XMMRegister S1 = xmm11; |
4228 | const XMMRegister S2 = xmm12; |
4229 | const XMMRegister S3 = xmm13; |
4230 | const XMMRegister S4 = xmm14; |
4231 | const XMMRegister S5 = xmm15; |
4232 | const XMMRegister S6 = xmm16; |
4233 | const XMMRegister S7 = xmm17; |
4234 | |
4235 | // Variables for storing decrypted text |
4236 | const XMMRegister B0 = xmm1; |
4237 | const XMMRegister B1 = xmm2; |
4238 | const XMMRegister B2 = xmm3; |
4239 | const XMMRegister B3 = xmm4; |
4240 | const XMMRegister B4 = xmm5; |
4241 | const XMMRegister B5 = xmm6; |
4242 | const XMMRegister B6 = xmm7; |
4243 | const XMMRegister B7 = xmm8; |
4244 | |
4245 | __ cmpl(rounds, 44); |
4246 | __ jcc(Assembler::greater, KEY_192); |
4247 | __ jmp(Loop); |
4248 | |
4249 | __ BIND(KEY_192); |
4250 | const XMMRegister RK11 = xmm27; |
4251 | const XMMRegister RK12 = xmm28; |
4252 | ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask); |
4253 | ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask); |
4254 | |
4255 | __ cmpl(rounds, 52); |
4256 | __ jcc(Assembler::greater, KEY_256); |
4257 | __ jmp(Loop); |
4258 | |
4259 | __ BIND(KEY_256); |
4260 | const XMMRegister RK13 = xmm29; |
4261 | const XMMRegister RK14 = xmm31; |
4262 | ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask); |
4263 | ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask); |
4264 | |
4265 | __ BIND(Loop); |
4266 | __ cmpl(len_reg, 512); |
4267 | __ jcc(Assembler::below, Lcbc_dec_rem); |
4268 | __ BIND(Loop1); |
4269 | __ subl(len_reg, 512); |
4270 | __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit); |
4271 | __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit); |
4272 | __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit); |
4273 | __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit); |
4274 | __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit); |
4275 | __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit); |
4276 | __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit); |
4277 | __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit); |
4278 | __ leaq(from, Address(from, 8 * 64)); |
4279 | |
4280 | __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); |
4281 | __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit); |
4282 | __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit); |
4283 | __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit); |
4284 | __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit); |
4285 | __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit); |
4286 | __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit); |
4287 | __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit); |
4288 | |
4289 | __ evalignq(IV, S0, IV, 0x06); |
4290 | __ evalignq(S0, S1, S0, 0x06); |
4291 | __ evalignq(S1, S2, S1, 0x06); |
4292 | __ evalignq(S2, S3, S2, 0x06); |
4293 | __ evalignq(S3, S4, S3, 0x06); |
4294 | __ evalignq(S4, S5, S4, 0x06); |
4295 | __ evalignq(S5, S6, S5, 0x06); |
4296 | __ evalignq(S6, S7, S6, 0x06); |
4297 | |
4298 | roundDec(RK2); |
4299 | roundDec(RK3); |
4300 | roundDec(RK4); |
4301 | roundDec(RK5); |
4302 | roundDec(RK6); |
4303 | roundDec(RK7); |
4304 | roundDec(RK8); |
4305 | roundDec(RK9); |
4306 | roundDec(RK10); |
4307 | |
4308 | __ cmpl(rounds, 44); |
4309 | __ jcc(Assembler::belowEqual, L_128); |
4310 | roundDec(RK11); |
4311 | roundDec(RK12); |
4312 | |
4313 | __ cmpl(rounds, 52); |
4314 | __ jcc(Assembler::belowEqual, L_192); |
4315 | roundDec(RK13); |
4316 | roundDec(RK14); |
4317 | |
4318 | __ BIND(L_256); |
4319 | roundDeclast(RK0); |
4320 | __ jmp(Loop2); |
4321 | |
4322 | __ BIND(L_128); |
4323 | roundDeclast(RK0); |
4324 | __ jmp(Loop2); |
4325 | |
4326 | __ BIND(L_192); |
4327 | roundDeclast(RK0); |
4328 | |
4329 | __ BIND(Loop2); |
4330 | __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); |
4331 | __ evpxorq(B1, B1, S0, Assembler::AVX_512bit); |
4332 | __ evpxorq(B2, B2, S1, Assembler::AVX_512bit); |
4333 | __ evpxorq(B3, B3, S2, Assembler::AVX_512bit); |
4334 | __ evpxorq(B4, B4, S3, Assembler::AVX_512bit); |
4335 | __ evpxorq(B5, B5, S4, Assembler::AVX_512bit); |
4336 | __ evpxorq(B6, B6, S5, Assembler::AVX_512bit); |
4337 | __ evpxorq(B7, B7, S6, Assembler::AVX_512bit); |
4338 | __ evmovdquq(IV, S7, Assembler::AVX_512bit); |
4339 | |
4340 | __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit); |
4341 | __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit); |
4342 | __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit); |
4343 | __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit); |
4344 | __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit); |
4345 | __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit); |
4346 | __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit); |
4347 | __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit); |
4348 | __ leaq(to, Address(to, 8 * 64)); |
4349 | __ jmp(Loop); |
4350 | |
4351 | __ BIND(Lcbc_dec_rem); |
4352 | __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit); |
4353 | |
4354 | __ BIND(Lcbc_dec_rem_loop); |
4355 | __ subl(len_reg, 16); |
4356 | __ jcc(Assembler::carrySet, Lcbc_dec_ret); |
4357 | |
4358 | __ movdqu(S0, Address(from, 0)); |
4359 | __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); |
4360 | __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit); |
4361 | __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit); |
4362 | __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit); |
4363 | __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit); |
4364 | __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit); |
4365 | __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit); |
4366 | __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit); |
4367 | __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit); |
4368 | __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit); |
4369 | __ cmpl(rounds, 44); |
4370 | __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); |
4371 | |
4372 | __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit); |
4373 | __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit); |
4374 | __ cmpl(rounds, 52); |
4375 | __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); |
4376 | |
4377 | __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit); |
4378 | __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit); |
4379 | |
4380 | __ BIND(Lcbc_dec_rem_last); |
4381 | __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit); |
4382 | |
4383 | __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); |
4384 | __ evmovdquq(IV, S0, Assembler::AVX_512bit); |
4385 | __ movdqu(Address(to, 0), B0); |
4386 | __ leaq(from, Address(from, 16)); |
4387 | __ leaq(to, Address(to, 16)); |
4388 | __ jmp(Lcbc_dec_rem_loop); |
4389 | |
4390 | __ BIND(Lcbc_dec_ret); |
4391 | __ movdqu(Address(rvec, 0), IV); |
4392 | |
4393 | // Zero out the round keys |
4394 | __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit); |
4395 | __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit); |
4396 | __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit); |
4397 | __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit); |
4398 | __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit); |
4399 | __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit); |
4400 | __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit); |
4401 | __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit); |
4402 | __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit); |
4403 | __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit); |
4404 | __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit); |
4405 | __ cmpl(rounds, 44); |
4406 | __ jcc(Assembler::belowEqual, Lcbc_exit); |
4407 | __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit); |
4408 | __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit); |
4409 | __ cmpl(rounds, 52); |
4410 | __ jcc(Assembler::belowEqual, Lcbc_exit); |
4411 | __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit); |
4412 | __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit); |
4413 | |
4414 | __ BIND(Lcbc_exit); |
4415 | __ pop(rbx); |
4416 | #ifdef _WIN64 |
4417 | __ movl(rax, len_mem); |
4418 | #else |
4419 | __ pop(rax); // return length |
4420 | #endif |
4421 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
4422 | __ ret(0); |
4423 | return start; |
4424 | } |
4425 | |
4426 | // Polynomial x^128+x^127+x^126+x^121+1 |
4427 | address ghash_polynomial_addr() { |
4428 | __ align(CodeEntryAlignment); |
4429 | StubCodeMark mark(this, "StubRoutines" , "_ghash_poly_addr" ); |
4430 | address start = __ pc(); |
4431 | __ emit_data64(0x0000000000000001, relocInfo::none); |
4432 | __ emit_data64(0xc200000000000000, relocInfo::none); |
4433 | return start; |
4434 | } |
4435 | |
4436 | address ghash_shufflemask_addr() { |
4437 | __ align(CodeEntryAlignment); |
4438 | StubCodeMark mark(this, "StubRoutines" , "_ghash_shuffmask_addr" ); |
4439 | address start = __ pc(); |
4440 | __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); |
4441 | __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); |
4442 | return start; |
4443 | } |
4444 | |
// GHASH single- and multi-block operations using AVX instructions
4446 | address generate_avx_ghash_processBlocks() { |
4447 | __ align(CodeEntryAlignment); |
4448 | |
4449 | StubCodeMark mark(this, "StubRoutines" , "ghash_processBlocks" ); |
4450 | address start = __ pc(); |
4451 | |
4452 | // arguments |
4453 | const Register state = c_rarg0; |
4454 | const Register htbl = c_rarg1; |
4455 | const Register data = c_rarg2; |
4456 | const Register blocks = c_rarg3; |
4457 | __ enter(); |
// Delegate the single/multi-block GHASH computation to the MacroAssembler
4459 | __ avx_ghash(state, htbl, data, blocks); |
4460 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
4461 | __ ret(0); |
4462 | return start; |
4463 | } |
4464 | |
4465 | // byte swap x86 long |
4466 | address generate_ghash_long_swap_mask() { |
4467 | __ align(CodeEntryAlignment); |
4468 | StubCodeMark mark(this, "StubRoutines" , "ghash_long_swap_mask" ); |
4469 | address start = __ pc(); |
4470 | __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); |
4471 | __ emit_data64(0x0706050403020100, relocInfo::none ); |
4472 | return start; |
4473 | } |
4474 | |
4475 | // byte swap x86 byte array |
4476 | address generate_ghash_byte_swap_mask() { |
4477 | __ align(CodeEntryAlignment); |
4478 | StubCodeMark mark(this, "StubRoutines" , "ghash_byte_swap_mask" ); |
4479 | address start = __ pc(); |
4480 | __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); |
4481 | __ emit_data64(0x0001020304050607, relocInfo::none ); |
4482 | return start; |
4483 | } |
4484 | |
4485 | /* Single and multi-block ghash operations */ |
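// For reference, each GHASH step computes state = (state ^ data_block) * H in
// GF(2^128), reduced by the polynomial x^128 + x^127 + x^126 + x^121 + 1 and
// operating on bit-reflected values. A sketch of the per-block update that the
// carry-less-multiply code below implements (gf128_mul is a hypothetical
// helper, shown only to state the math):
//
//   // state, subkeyH and block are 128-bit values in GHASH bit order.
//   state = gf128_mul(state ^ block, subkeyH);
//
// The pclmulqdq sequence forms the 256-bit carry-less product from the four
// partial products a0*b0, a0*b1, a1*b0 and a1*b1, then folds it back to 128
// bits in the two reduction phases commented in the body.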
4486 | address generate_ghash_processBlocks() { |
4487 | __ align(CodeEntryAlignment); |
4488 | Label L_ghash_loop, L_exit; |
4489 | StubCodeMark mark(this, "StubRoutines" , "ghash_processBlocks" ); |
4490 | address start = __ pc(); |
4491 | |
4492 | const Register state = c_rarg0; |
4493 | const Register subkeyH = c_rarg1; |
4494 | const Register data = c_rarg2; |
4495 | const Register blocks = c_rarg3; |
4496 | |
4497 | const XMMRegister xmm_temp0 = xmm0; |
4498 | const XMMRegister xmm_temp1 = xmm1; |
4499 | const XMMRegister xmm_temp2 = xmm2; |
4500 | const XMMRegister xmm_temp3 = xmm3; |
4501 | const XMMRegister xmm_temp4 = xmm4; |
4502 | const XMMRegister xmm_temp5 = xmm5; |
4503 | const XMMRegister xmm_temp6 = xmm6; |
4504 | const XMMRegister xmm_temp7 = xmm7; |
4505 | const XMMRegister xmm_temp8 = xmm8; |
4506 | const XMMRegister xmm_temp9 = xmm9; |
4507 | const XMMRegister xmm_temp10 = xmm10; |
4508 | |
4509 | __ enter(); |
4510 | |
4511 | __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
4512 | |
4513 | __ movdqu(xmm_temp0, Address(state, 0)); |
4514 | __ pshufb(xmm_temp0, xmm_temp10); |
4515 | |
4516 | |
4517 | __ BIND(L_ghash_loop); |
4518 | __ movdqu(xmm_temp2, Address(data, 0)); |
4519 | __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
4520 | |
4521 | __ movdqu(xmm_temp1, Address(subkeyH, 0)); |
4522 | __ pshufb(xmm_temp1, xmm_temp10); |
4523 | |
4524 | __ pxor(xmm_temp0, xmm_temp2); |
4525 | |
4526 | // |
4527 | // Multiply with the hash key |
4528 | // |
4529 | __ movdqu(xmm_temp3, xmm_temp0); |
4530 | __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 |
4531 | __ movdqu(xmm_temp4, xmm_temp0); |
4532 | __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 |
4533 | |
4534 | __ movdqu(xmm_temp5, xmm_temp0); |
4535 | __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 |
4536 | __ movdqu(xmm_temp6, xmm_temp0); |
4537 | __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 |
4538 | |
4539 | __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 |
4540 | |
4541 | __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 |
__ psrldq(xmm_temp4, 8); // shift xmm4 by 64 bits to the right
__ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left
4544 | __ pxor(xmm_temp3, xmm_temp5); |
4545 | __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result |
4546 | // of the carry-less multiplication of |
4547 | // xmm0 by xmm1. |
4548 | |
// We shift the result of the multiplication by one bit position
// to the left to compensate for the fact that the bits are reversed.
4551 | __ movdqu(xmm_temp7, xmm_temp3); |
4552 | __ movdqu(xmm_temp8, xmm_temp6); |
4553 | __ pslld(xmm_temp3, 1); |
4554 | __ pslld(xmm_temp6, 1); |
4555 | __ psrld(xmm_temp7, 31); |
4556 | __ psrld(xmm_temp8, 31); |
4557 | __ movdqu(xmm_temp9, xmm_temp7); |
4558 | __ pslldq(xmm_temp8, 4); |
4559 | __ pslldq(xmm_temp7, 4); |
4560 | __ psrldq(xmm_temp9, 12); |
4561 | __ por(xmm_temp3, xmm_temp7); |
4562 | __ por(xmm_temp6, xmm_temp8); |
4563 | __ por(xmm_temp6, xmm_temp9); |
4564 | |
4565 | // |
4566 | // First phase of the reduction |
4567 | // |
4568 | // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts |
4569 | // independently. |
4570 | __ movdqu(xmm_temp7, xmm_temp3); |
4571 | __ movdqu(xmm_temp8, xmm_temp3); |
4572 | __ movdqu(xmm_temp9, xmm_temp3); |
__ pslld(xmm_temp7, 31); // packed left shift by 31 bits
__ pslld(xmm_temp8, 30); // packed left shift by 30 bits
__ pslld(xmm_temp9, 25); // packed left shift by 25 bits
4576 | __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions |
4577 | __ pxor(xmm_temp7, xmm_temp9); |
4578 | __ movdqu(xmm_temp8, xmm_temp7); |
4579 | __ pslldq(xmm_temp7, 12); |
4580 | __ psrldq(xmm_temp8, 4); |
4581 | __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete |
4582 | |
4583 | // |
4584 | // Second phase of the reduction |
4585 | // |
4586 | // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these |
4587 | // shift operations. |
4588 | __ movdqu(xmm_temp2, xmm_temp3); |
4589 | __ movdqu(xmm_temp4, xmm_temp3); |
4590 | __ movdqu(xmm_temp5, xmm_temp3); |
__ psrld(xmm_temp2, 1); // packed right shift by 1 bit
__ psrld(xmm_temp4, 2); // packed right shift by 2 bits
__ psrld(xmm_temp5, 7); // packed right shift by 7 bits
4594 | __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions |
4595 | __ pxor(xmm_temp2, xmm_temp5); |
4596 | __ pxor(xmm_temp2, xmm_temp8); |
4597 | __ pxor(xmm_temp3, xmm_temp2); |
4598 | __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 |
4599 | |
4600 | __ decrement(blocks); |
4601 | __ jcc(Assembler::zero, L_exit); |
4602 | __ movdqu(xmm_temp0, xmm_temp6); |
4603 | __ addptr(data, 16); |
4604 | __ jmp(L_ghash_loop); |
4605 | |
4606 | __ BIND(L_exit); |
4607 | __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result |
4608 | __ movdqu(Address(state, 0), xmm_temp6); // store the result |
4609 | __ leave(); |
4610 | __ ret(0); |
4611 | return start; |
4612 | } |
4613 | |
4614 | //base64 character set |
4615 | address base64_charset_addr() { |
4616 | __ align(CodeEntryAlignment); |
4617 | StubCodeMark mark(this, "StubRoutines" , "base64_charset" ); |
4618 | address start = __ pc(); |
4619 | __ emit_data64(0x0000004200000041, relocInfo::none); |
4620 | __ emit_data64(0x0000004400000043, relocInfo::none); |
4621 | __ emit_data64(0x0000004600000045, relocInfo::none); |
4622 | __ emit_data64(0x0000004800000047, relocInfo::none); |
4623 | __ emit_data64(0x0000004a00000049, relocInfo::none); |
4624 | __ emit_data64(0x0000004c0000004b, relocInfo::none); |
4625 | __ emit_data64(0x0000004e0000004d, relocInfo::none); |
4626 | __ emit_data64(0x000000500000004f, relocInfo::none); |
4627 | __ emit_data64(0x0000005200000051, relocInfo::none); |
4628 | __ emit_data64(0x0000005400000053, relocInfo::none); |
4629 | __ emit_data64(0x0000005600000055, relocInfo::none); |
4630 | __ emit_data64(0x0000005800000057, relocInfo::none); |
4631 | __ emit_data64(0x0000005a00000059, relocInfo::none); |
4632 | __ emit_data64(0x0000006200000061, relocInfo::none); |
4633 | __ emit_data64(0x0000006400000063, relocInfo::none); |
4634 | __ emit_data64(0x0000006600000065, relocInfo::none); |
4635 | __ emit_data64(0x0000006800000067, relocInfo::none); |
4636 | __ emit_data64(0x0000006a00000069, relocInfo::none); |
4637 | __ emit_data64(0x0000006c0000006b, relocInfo::none); |
4638 | __ emit_data64(0x0000006e0000006d, relocInfo::none); |
4639 | __ emit_data64(0x000000700000006f, relocInfo::none); |
4640 | __ emit_data64(0x0000007200000071, relocInfo::none); |
4641 | __ emit_data64(0x0000007400000073, relocInfo::none); |
4642 | __ emit_data64(0x0000007600000075, relocInfo::none); |
4643 | __ emit_data64(0x0000007800000077, relocInfo::none); |
4644 | __ emit_data64(0x0000007a00000079, relocInfo::none); |
4645 | __ emit_data64(0x0000003100000030, relocInfo::none); |
4646 | __ emit_data64(0x0000003300000032, relocInfo::none); |
4647 | __ emit_data64(0x0000003500000034, relocInfo::none); |
4648 | __ emit_data64(0x0000003700000036, relocInfo::none); |
4649 | __ emit_data64(0x0000003900000038, relocInfo::none); |
4650 | __ emit_data64(0x0000002f0000002b, relocInfo::none); |
4651 | return start; |
4652 | } |
4653 | |
4654 | //base64 url character set |
4655 | address base64url_charset_addr() { |
4656 | __ align(CodeEntryAlignment); |
4657 | StubCodeMark mark(this, "StubRoutines" , "base64url_charset" ); |
4658 | address start = __ pc(); |
4659 | __ emit_data64(0x0000004200000041, relocInfo::none); |
4660 | __ emit_data64(0x0000004400000043, relocInfo::none); |
4661 | __ emit_data64(0x0000004600000045, relocInfo::none); |
4662 | __ emit_data64(0x0000004800000047, relocInfo::none); |
4663 | __ emit_data64(0x0000004a00000049, relocInfo::none); |
4664 | __ emit_data64(0x0000004c0000004b, relocInfo::none); |
4665 | __ emit_data64(0x0000004e0000004d, relocInfo::none); |
4666 | __ emit_data64(0x000000500000004f, relocInfo::none); |
4667 | __ emit_data64(0x0000005200000051, relocInfo::none); |
4668 | __ emit_data64(0x0000005400000053, relocInfo::none); |
4669 | __ emit_data64(0x0000005600000055, relocInfo::none); |
4670 | __ emit_data64(0x0000005800000057, relocInfo::none); |
4671 | __ emit_data64(0x0000005a00000059, relocInfo::none); |
4672 | __ emit_data64(0x0000006200000061, relocInfo::none); |
4673 | __ emit_data64(0x0000006400000063, relocInfo::none); |
4674 | __ emit_data64(0x0000006600000065, relocInfo::none); |
4675 | __ emit_data64(0x0000006800000067, relocInfo::none); |
4676 | __ emit_data64(0x0000006a00000069, relocInfo::none); |
4677 | __ emit_data64(0x0000006c0000006b, relocInfo::none); |
4678 | __ emit_data64(0x0000006e0000006d, relocInfo::none); |
4679 | __ emit_data64(0x000000700000006f, relocInfo::none); |
4680 | __ emit_data64(0x0000007200000071, relocInfo::none); |
4681 | __ emit_data64(0x0000007400000073, relocInfo::none); |
4682 | __ emit_data64(0x0000007600000075, relocInfo::none); |
4683 | __ emit_data64(0x0000007800000077, relocInfo::none); |
4684 | __ emit_data64(0x0000007a00000079, relocInfo::none); |
4685 | __ emit_data64(0x0000003100000030, relocInfo::none); |
4686 | __ emit_data64(0x0000003300000032, relocInfo::none); |
4687 | __ emit_data64(0x0000003500000034, relocInfo::none); |
4688 | __ emit_data64(0x0000003700000036, relocInfo::none); |
4689 | __ emit_data64(0x0000003900000038, relocInfo::none); |
4690 | __ emit_data64(0x0000005f0000002d, relocInfo::none); |
4691 | |
4692 | return start; |
4693 | } |
4694 | |
4695 | address base64_bswap_mask_addr() { |
4696 | __ align(CodeEntryAlignment); |
4697 | StubCodeMark mark(this, "StubRoutines" , "bswap_mask_base64" ); |
4698 | address start = __ pc(); |
4699 | __ emit_data64(0x0504038002010080, relocInfo::none); |
4700 | __ emit_data64(0x0b0a098008070680, relocInfo::none); |
4701 | __ emit_data64(0x0908078006050480, relocInfo::none); |
4702 | __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none); |
4703 | __ emit_data64(0x0605048003020180, relocInfo::none); |
4704 | __ emit_data64(0x0c0b0a8009080780, relocInfo::none); |
4705 | __ emit_data64(0x0504038002010080, relocInfo::none); |
4706 | __ emit_data64(0x0b0a098008070680, relocInfo::none); |
4707 | |
4708 | return start; |
4709 | } |
4710 | |
4711 | address base64_right_shift_mask_addr() { |
4712 | __ align(CodeEntryAlignment); |
4713 | StubCodeMark mark(this, "StubRoutines" , "right_shift_mask" ); |
4714 | address start = __ pc(); |
4715 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4716 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4717 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4718 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4719 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4720 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4721 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4722 | __ emit_data64(0x0006000400020000, relocInfo::none); |
4723 | |
4724 | return start; |
4725 | } |
4726 | |
4727 | address base64_left_shift_mask_addr() { |
4728 | __ align(CodeEntryAlignment); |
4729 | StubCodeMark mark(this, "StubRoutines" , "left_shift_mask" ); |
4730 | address start = __ pc(); |
4731 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4732 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4733 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4734 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4735 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4736 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4737 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4738 | __ emit_data64(0x0000000200040000, relocInfo::none); |
4739 | |
4740 | return start; |
4741 | } |
4742 | |
4743 | address base64_and_mask_addr() { |
4744 | __ align(CodeEntryAlignment); |
4745 | StubCodeMark mark(this, "StubRoutines" , "and_mask" ); |
4746 | address start = __ pc(); |
4747 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4748 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4749 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4750 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4751 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4752 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4753 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4754 | __ emit_data64(0x3f003f003f000000, relocInfo::none); |
4755 | return start; |
4756 | } |
4757 | |
4758 | address base64_gather_mask_addr() { |
4759 | __ align(CodeEntryAlignment); |
4760 | StubCodeMark mark(this, "StubRoutines" , "gather_mask" ); |
4761 | address start = __ pc(); |
4762 | __ emit_data64(0xffffffffffffffff, relocInfo::none); |
4763 | return start; |
4764 | } |
4765 | |
4766 | // Code for generating Base64 encoding. |
4767 | // Intrinsic function prototype in Base64.java: |
4768 | // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) { |
4769 | address generate_base64_encodeBlock() { |
4770 | __ align(CodeEntryAlignment); |
4771 | StubCodeMark mark(this, "StubRoutines" , "implEncode" ); |
4772 | address start = __ pc(); |
4773 | __ enter(); |
4774 | |
4775 | // Save callee-saved registers before using them |
4776 | __ push(r12); |
4777 | __ push(r13); |
4778 | __ push(r14); |
4779 | __ push(r15); |
4780 | |
4781 | // arguments |
4782 | const Register source = c_rarg0; // Source Array |
4783 | const Register start_offset = c_rarg1; // start offset |
4784 | const Register end_offset = c_rarg2; // end offset |
4785 | const Register dest = c_rarg3; // destination array |
4786 | |
4787 | #ifndef _WIN64 |
4788 | const Register dp = c_rarg4; // Position for writing to dest array |
4789 | const Register isURL = c_rarg5;// Base64 or URL character set |
4790 | #else |
const Address dp_mem(rbp, 6 * wordSize); // dp (destination position) is on the stack on Win64
4792 | const Address isURL_mem(rbp, 7 * wordSize); |
4793 | const Register isURL = r10; // pick the volatile windows register |
4794 | const Register dp = r12; |
4795 | __ movl(dp, dp_mem); |
4796 | __ movl(isURL, isURL_mem); |
4797 | #endif |
4798 | |
4799 | const Register length = r14; |
4800 | Label L_process80, L_process32, L_process3, L_exit, L_processdata; |
4801 | |
4802 | // calculate length from offsets |
4803 | __ movl(length, end_offset); |
4804 | __ subl(length, start_offset); |
4805 | __ cmpl(length, 0); |
4806 | __ jcc(Assembler::lessEqual, L_exit); |
4807 | |
4808 | __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr())); |
4809 | // check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded |
4810 | __ cmpl(isURL, 0); |
4811 | __ jcc(Assembler::equal, L_processdata); |
4812 | __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr())); |
4813 | |
4814 | // load masks required for encoding data |
4815 | __ BIND(L_processdata); |
4816 | __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr())); |
4817 | // Set 64 bits of K register. |
4818 | __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit); |
4819 | __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13); |
4820 | __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13); |
4821 | __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13); |
4822 | __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13); |
4823 | |
4824 | // Vector Base64 implementation, producing 96 bytes of encoded data |
4825 | __ BIND(L_process80); |
4826 | __ cmpl(length, 80); |
4827 | __ jcc(Assembler::below, L_process32); |
4828 | __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit); |
4829 | __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit); |
4830 | __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit); |
4831 | |
//permute the input data so that each 128-bit lane holds a contiguous run of source bytes
4833 | __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit); |
4834 | __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit); |
4835 | __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit); |
4836 | |
//shuffle the input, grouping 3 source bytes plus a zero byte into each 4-byte group;
//this lets us process 12 source bytes at a time in a 128-bit register
4839 | __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit); |
4840 | __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit); |
4841 | __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit); |
4842 | |
//convert bytes to words; each 128-bit register has 6 bytes of data for processing
4844 | __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit); |
4845 | __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit); |
4846 | __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit); |
4847 | |
// Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values
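// In scalar terms, for three input bytes b0, b1, b2 the four 6-bit indices are:
//   i0 =  b0 >> 2;                        // top 6 bits of b0
//   i1 = ((b0 & 0x03) << 4) | (b1 >> 4);  // low 2 bits of b0 + top 4 bits of b1
//   i2 = ((b1 & 0x0f) << 2) | (b2 >> 6);  // low 4 bits of b1 + top 2 bits of b2
//   i3 =   b2 & 0x3f;                     // low 6 bits of b2
// The variable per-word shifts below compute this split for many lanes at once.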
4849 | __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit); |
4850 | __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit); |
4851 | __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit); |
4852 | |
4853 | __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit); |
4854 | __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit); |
4855 | __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit); |
4856 | |
4857 | __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit); |
4858 | __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); |
4859 | __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); |
4860 | |
4861 | __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); |
4862 | __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit); |
4863 | __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit); |
4864 | |
4865 | __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); |
4866 | __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit); |
4867 | __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit); |
4868 | |
4869 | // Get the final 4*6 bits base64 encoding |
4870 | __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit); |
4871 | __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit); |
4872 | __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit); |
4873 | |
4874 | // Shift |
4875 | __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit); |
4876 | __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit); |
4877 | __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit); |
4878 | |
4879 | // look up 6 bits in the base64 character set to fetch the encoding |
4880 | // we are converting word to dword as gather instructions need dword indices for looking up encoding |
4881 | __ vextracti64x4(xmm6, xmm3, 0); |
4882 | __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit); |
4883 | __ vextracti64x4(xmm6, xmm3, 1); |
4884 | __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit); |
4885 | |
4886 | __ vextracti64x4(xmm6, xmm4, 0); |
4887 | __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit); |
4888 | __ vextracti64x4(xmm6, xmm4, 1); |
4889 | __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit); |
4890 | |
4891 | __ vextracti64x4(xmm4, xmm5, 0); |
4892 | __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit); |
4893 | |
4894 | __ vextracti64x4(xmm4, xmm5, 1); |
4895 | __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit); |
4896 | |
4897 | __ kmovql(k2, k3); |
4898 | __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit); |
4899 | __ kmovql(k2, k3); |
4900 | __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit); |
4901 | __ kmovql(k2, k3); |
4902 | __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit); |
4903 | __ kmovql(k2, k3); |
4904 | __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit); |
4905 | __ kmovql(k2, k3); |
4906 | __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); |
4907 | __ kmovql(k2, k3); |
4908 | __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit); |
4909 | |
4910 | //Down convert dword to byte. Final output is 16*6 = 96 bytes long |
4911 | __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit); |
4912 | __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit); |
4913 | __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit); |
4914 | __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit); |
4915 | __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit); |
4916 | __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit); |
4917 | |
4918 | __ addq(dest, 96); |
4919 | __ addq(source, 72); |
4920 | __ subq(length, 72); |
4921 | __ jmp(L_process80); |
4922 | |
4923 | // Vector Base64 implementation generating 32 bytes of encoded data |
4924 | __ BIND(L_process32); |
4925 | __ cmpl(length, 32); |
4926 | __ jcc(Assembler::below, L_process3); |
4927 | __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit); |
4928 | __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit); |
4929 | __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit); |
4930 | __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit); |
4931 | __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit); |
4932 | __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit); |
4933 | |
4934 | __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); |
4935 | __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); |
4936 | __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); |
4937 | __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit); |
4938 | __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); |
4939 | __ vextracti64x4(xmm9, xmm1, 0); |
4940 | __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit); |
4941 | __ vextracti64x4(xmm9, xmm1, 1); |
4942 | __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit); |
4943 | __ kmovql(k2, k3); |
4944 | __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); |
4945 | __ kmovql(k2, k3); |
4946 | __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit); |
4947 | __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit); |
4948 | __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit); |
4949 | __ subq(length, 24); |
4950 | __ addq(dest, 32); |
4951 | __ addq(source, 24); |
4952 | __ jmp(L_process32); |
4953 | |
4954 | // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data |
4955 | /* This code corresponds to the scalar version of the following snippet in Base64.java |
4956 | ** int bits = (src[sp0++] & 0xff) << 16 |(src[sp0++] & 0xff) << 8 |(src[sp0++] & 0xff); |
** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
4960 | ** dst[dp0++] = (byte)base64[bits & 0x3f];*/ |
4961 | __ BIND(L_process3); |
4962 | __ cmpl(length, 3); |
4963 | __ jcc(Assembler::below, L_exit); |
4964 | // Read 1 byte at a time |
4965 | __ movzbl(rax, Address(source, start_offset)); |
4966 | __ shll(rax, 0x10); |
4967 | __ movl(r15, rax); |
4968 | __ movzbl(rax, Address(source, start_offset, Address::times_1, 1)); |
4969 | __ shll(rax, 0x8); |
4970 | __ movzwl(rax, rax); |
4971 | __ orl(r15, rax); |
4972 | __ movzbl(rax, Address(source, start_offset, Address::times_1, 2)); |
4973 | __ orl(rax, r15); |
4974 | // Save 3 bytes read in r15 |
4975 | __ movl(r15, rax); |
4976 | __ shrl(rax, 0x12); |
4977 | __ andl(rax, 0x3f); |
4978 | // rax contains the index, r11 contains base64 lookup table |
4979 | __ movb(rax, Address(r11, rax, Address::times_4)); |
4980 | // Write the encoded byte to destination |
4981 | __ movb(Address(dest, dp, Address::times_1, 0), rax); |
4982 | __ movl(rax, r15); |
4983 | __ shrl(rax, 0xc); |
4984 | __ andl(rax, 0x3f); |
4985 | __ movb(rax, Address(r11, rax, Address::times_4)); |
4986 | __ movb(Address(dest, dp, Address::times_1, 1), rax); |
4987 | __ movl(rax, r15); |
4988 | __ shrl(rax, 0x6); |
4989 | __ andl(rax, 0x3f); |
4990 | __ movb(rax, Address(r11, rax, Address::times_4)); |
4991 | __ movb(Address(dest, dp, Address::times_1, 2), rax); |
4992 | __ movl(rax, r15); |
4993 | __ andl(rax, 0x3f); |
4994 | __ movb(rax, Address(r11, rax, Address::times_4)); |
4995 | __ movb(Address(dest, dp, Address::times_1, 3), rax); |
4996 | __ subl(length, 3); |
4997 | __ addq(dest, 4); |
4998 | __ addq(source, 3); |
4999 | __ jmp(L_process3); |
5000 | __ BIND(L_exit); |
5001 | __ pop(r15); |
5002 | __ pop(r14); |
5003 | __ pop(r13); |
5004 | __ pop(r12); |
5005 | __ leave(); |
5006 | __ ret(0); |
5007 | return start; |
5008 | } |
5009 | |
5010 | /** |
5011 | * Arguments: |
5012 | * |
5013 | * Inputs: |
5014 | * c_rarg0 - int crc |
5015 | * c_rarg1 - byte* buf |
5016 | * c_rarg2 - int length |
5017 | * |
* Output:
5019 | * rax - int crc result |
5020 | */ |
5021 | address generate_updateBytesCRC32() { |
5022 | assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions" ); |
5023 | |
5024 | __ align(CodeEntryAlignment); |
5025 | StubCodeMark mark(this, "StubRoutines" , "updateBytesCRC32" ); |
5026 | |
5027 | address start = __ pc(); |
5028 | // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) |
5029 | // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) |
5030 | // rscratch1: r10 |
5031 | const Register crc = c_rarg0; // crc |
5032 | const Register buf = c_rarg1; // source java byte array address |
5033 | const Register len = c_rarg2; // length |
5034 | const Register table = c_rarg3; // crc_table address (reuse register) |
5035 | const Register tmp = r11; |
5036 | assert_different_registers(crc, buf, len, table, tmp, rax); |
5037 | |
5038 | BLOCK_COMMENT("Entry:" ); |
5039 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5040 | |
5041 | __ kernel_crc32(crc, buf, len, table, tmp); |
5042 | |
5043 | __ movl(rax, crc); |
5044 | __ vzeroupper(); |
5045 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5046 | __ ret(0); |
5047 | |
5048 | return start; |
5049 | } |
5050 | |
5051 | /** |
5052 | * Arguments: |
5053 | * |
5054 | * Inputs: |
5055 | * c_rarg0 - int crc |
5056 | * c_rarg1 - byte* buf |
5057 | * c_rarg2 - long length |
5058 | * c_rarg3 - table_start - optional (present only when doing a library_call, |
5059 | * not used by x86 algorithm) |
5060 | * |
* Output:
5062 | * rax - int crc result |
5063 | */ |
5064 | address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) { |
5065 | assert(UseCRC32CIntrinsics, "need SSE4_2" ); |
5066 | __ align(CodeEntryAlignment); |
5067 | StubCodeMark mark(this, "StubRoutines" , "updateBytesCRC32C" ); |
5068 | address start = __ pc(); |
5069 | //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs |
5070 | //Windows RCX RDX R8 R9 none none XMM0..XMM3 |
5071 | //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 |
5072 | const Register crc = c_rarg0; // crc |
5073 | const Register buf = c_rarg1; // source java byte array address |
5074 | const Register len = c_rarg2; // length |
5075 | const Register a = rax; |
5076 | const Register j = r9; |
5077 | const Register k = r10; |
5078 | const Register l = r11; |
5079 | #ifdef _WIN64 |
5080 | const Register y = rdi; |
5081 | const Register z = rsi; |
5082 | #else |
5083 | const Register y = rcx; |
5084 | const Register z = r8; |
5085 | #endif |
5086 | assert_different_registers(crc, buf, len, a, j, k, l, y, z); |
5087 | |
5088 | BLOCK_COMMENT("Entry:" ); |
5089 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5090 | #ifdef _WIN64 |
5091 | __ push(y); |
5092 | __ push(z); |
5093 | #endif |
5094 | __ crc32c_ipl_alg2_alt2(crc, buf, len, |
5095 | a, j, k, |
5096 | l, y, z, |
5097 | c_farg0, c_farg1, c_farg2, |
5098 | is_pclmulqdq_supported); |
5099 | __ movl(rax, crc); |
5100 | #ifdef _WIN64 |
5101 | __ pop(z); |
5102 | __ pop(y); |
5103 | #endif |
5104 | __ vzeroupper(); |
5105 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5106 | __ ret(0); |
5107 | |
5108 | return start; |
5109 | } |
5110 | |
5111 | /** |
5112 | * Arguments: |
5113 | * |
5114 | * Input: |
5115 | * c_rarg0 - x address |
5116 | * c_rarg1 - x length |
5117 | * c_rarg2 - y address |
5118 | * c_rarg3 - y length |
5119 | * not Win64 |
5120 | * c_rarg4 - z address |
5121 | * c_rarg5 - z length |
5122 | * Win64 |
5123 | * rsp+40 - z address |
5124 | * rsp+48 - z length |
5125 | */ |
5126 | address generate_multiplyToLen() { |
5127 | __ align(CodeEntryAlignment); |
5128 | StubCodeMark mark(this, "StubRoutines" , "multiplyToLen" ); |
5129 | |
5130 | address start = __ pc(); |
5131 | // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) |
5132 | // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) |
5133 | const Register x = rdi; |
5134 | const Register xlen = rax; |
5135 | const Register y = rsi; |
5136 | const Register ylen = rcx; |
5137 | const Register z = r8; |
5138 | const Register zlen = r11; |
5139 | |
5140 | // Next registers will be saved on stack in multiply_to_len(). |
5141 | const Register tmp1 = r12; |
5142 | const Register tmp2 = r13; |
5143 | const Register tmp3 = r14; |
5144 | const Register tmp4 = r15; |
5145 | const Register tmp5 = rbx; |
5146 | |
5147 | BLOCK_COMMENT("Entry:" ); |
5148 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5149 | |
5150 | #ifndef _WIN64 |
5151 | __ movptr(zlen, r9); // Save r9 in r11 - zlen |
5152 | #endif |
5153 | setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx |
5154 | // ylen => rcx, z => r8, zlen => r11 |
5155 | // r9 and r10 may be used to save non-volatile registers |
5156 | #ifdef _WIN64 |
5157 | // last 2 arguments (#4, #5) are on stack on Win64 |
5158 | __ movptr(z, Address(rsp, 6 * wordSize)); |
5159 | __ movptr(zlen, Address(rsp, 7 * wordSize)); |
5160 | #endif |
5161 | |
5162 | __ movptr(xlen, rsi); |
5163 | __ movptr(y, rdx); |
5164 | __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5); |
5165 | |
5166 | restore_arg_regs(); |
5167 | |
5168 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5169 | __ ret(0); |
5170 | |
5171 | return start; |
5172 | } |
5173 | |
5174 | /** |
5175 | * Arguments: |
5176 | * |
5177 | * Input: |
5178 | * c_rarg0 - obja address |
5179 | * c_rarg1 - objb address |
*   c_rarg2   - length   length (in elements)
*   c_rarg3   - scale    log2 of the element size (log2_array_index_scale)
*
* Output:
*   rax   - index of the first mismatching element if >= 0; bitwise complement of the number of unchecked tail elements if < 0
5185 | */ |
5186 | address generate_vectorizedMismatch() { |
5187 | __ align(CodeEntryAlignment); |
5188 | StubCodeMark mark(this, "StubRoutines" , "vectorizedMismatch" ); |
5189 | address start = __ pc(); |
5190 | |
5191 | BLOCK_COMMENT("Entry:" ); |
5192 | __ enter(); |
5193 | |
5194 | #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) |
5195 | const Register scale = c_rarg0; //rcx, will exchange with r9 |
5196 | const Register objb = c_rarg1; //rdx |
5197 | const Register length = c_rarg2; //r8 |
5198 | const Register obja = c_rarg3; //r9 |
5199 | __ xchgq(obja, scale); //now obja and scale contains the correct contents |
5200 | |
5201 | const Register tmp1 = r10; |
5202 | const Register tmp2 = r11; |
5203 | #endif |
5204 | #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) |
5205 | const Register obja = c_rarg0; //U:rdi |
5206 | const Register objb = c_rarg1; //U:rsi |
5207 | const Register length = c_rarg2; //U:rdx |
5208 | const Register scale = c_rarg3; //U:rcx |
5209 | const Register tmp1 = r8; |
5210 | const Register tmp2 = r9; |
5211 | #endif |
5212 | const Register result = rax; //return value |
5213 | const XMMRegister vec0 = xmm0; |
5214 | const XMMRegister vec1 = xmm1; |
5215 | const XMMRegister vec2 = xmm2; |
5216 | |
5217 | __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); |
5218 | |
5219 | __ vzeroupper(); |
5220 | __ leave(); |
5221 | __ ret(0); |
5222 | |
5223 | return start; |
5224 | } |
5225 | |
5226 | /** |
5227 | * Arguments: |
5228 | * |
* Input:
*   c_rarg0   - x address
*   c_rarg1   - x length
*   c_rarg2   - z address
*   c_rarg3   - z length
5234 | * |
5235 | */ |
5236 | address generate_squareToLen() { |
5237 | |
5238 | __ align(CodeEntryAlignment); |
5239 | StubCodeMark mark(this, "StubRoutines" , "squareToLen" ); |
5240 | |
5241 | address start = __ pc(); |
5242 | // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) |
5243 | // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...) |
5244 | const Register x = rdi; |
5245 | const Register len = rsi; |
5246 | const Register z = r8; |
5247 | const Register zlen = rcx; |
5248 | |
5249 | const Register tmp1 = r12; |
5250 | const Register tmp2 = r13; |
5251 | const Register tmp3 = r14; |
5252 | const Register tmp4 = r15; |
5253 | const Register tmp5 = rbx; |
5254 | |
5255 | BLOCK_COMMENT("Entry:" ); |
5256 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5257 | |
5258 | setup_arg_regs(4); // x => rdi, len => rsi, z => rdx |
5259 | // zlen => rcx |
5260 | // r9 and r10 may be used to save non-volatile registers |
5261 | __ movptr(r8, rdx); |
5262 | __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); |
5263 | |
5264 | restore_arg_regs(); |
5265 | |
5266 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5267 | __ ret(0); |
5268 | |
5269 | return start; |
5270 | } |
5271 | |
5272 | address generate_method_entry_barrier() { |
5273 | __ align(CodeEntryAlignment); |
5274 | StubCodeMark mark(this, "StubRoutines" , "nmethod_entry_barrier" ); |
5275 | |
5276 | Label deoptimize_label; |
5277 | |
5278 | address start = __ pc(); |
5279 | |
5280 | __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing |
5281 | |
5282 | BLOCK_COMMENT("Entry:" ); |
5283 | __ enter(); // save rbp |
5284 | |
5285 | // save c_rarg0, because we want to use that value. |
5286 | // We could do without it but then we depend on the number of slots used by pusha |
5287 | __ push(c_rarg0); |
5288 | |
5289 | __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address |
5290 | |
5291 | __ pusha(); |
5292 | |
5293 | // The method may have floats as arguments, and we must spill them before calling |
5294 | // the VM runtime. |
5295 | assert(Argument::n_float_register_parameters_j == 8, "Assumption" ); |
5296 | const int xmm_size = wordSize * 2; |
5297 | const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j; |
5298 | __ subptr(rsp, xmm_spill_size); |
5299 | __ movdqu(Address(rsp, xmm_size * 7), xmm7); |
5300 | __ movdqu(Address(rsp, xmm_size * 6), xmm6); |
5301 | __ movdqu(Address(rsp, xmm_size * 5), xmm5); |
5302 | __ movdqu(Address(rsp, xmm_size * 4), xmm4); |
5303 | __ movdqu(Address(rsp, xmm_size * 3), xmm3); |
5304 | __ movdqu(Address(rsp, xmm_size * 2), xmm2); |
5305 | __ movdqu(Address(rsp, xmm_size * 1), xmm1); |
5306 | __ movdqu(Address(rsp, xmm_size * 0), xmm0); |
5307 | |
5308 | __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1); |
5309 | |
5310 | __ movdqu(xmm0, Address(rsp, xmm_size * 0)); |
5311 | __ movdqu(xmm1, Address(rsp, xmm_size * 1)); |
5312 | __ movdqu(xmm2, Address(rsp, xmm_size * 2)); |
5313 | __ movdqu(xmm3, Address(rsp, xmm_size * 3)); |
5314 | __ movdqu(xmm4, Address(rsp, xmm_size * 4)); |
5315 | __ movdqu(xmm5, Address(rsp, xmm_size * 5)); |
5316 | __ movdqu(xmm6, Address(rsp, xmm_size * 6)); |
5317 | __ movdqu(xmm7, Address(rsp, xmm_size * 7)); |
5318 | __ addptr(rsp, xmm_spill_size); |
5319 | |
5320 | __ cmpl(rax, 1); // 1 means deoptimize |
5321 | __ jcc(Assembler::equal, deoptimize_label); |
5322 | |
5323 | __ popa(); |
5324 | __ pop(c_rarg0); |
5325 | |
5326 | __ leave(); |
5327 | |
5328 | __ addptr(rsp, 1 * wordSize); // cookie |
5329 | __ ret(0); |
5330 | |
5331 | |
5332 | __ BIND(deoptimize_label); |
5333 | |
5334 | __ popa(); |
5335 | __ pop(c_rarg0); |
5336 | |
5337 | __ leave(); |
5338 | |
// This could be removed, but it is useful for verification: getting a SIGSEGV
// here while still having a correct stack is valuable for debugging.
5341 | __ testptr(rsp, Address(rsp, 0)); |
5342 | |
5343 | __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier |
5344 | __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point |
5345 | |
5346 | return start; |
5347 | } |
5348 | |
5349 | /** |
5350 | * Arguments: |
5351 | * |
5352 | * Input: |
5353 | * c_rarg0 - out address |
5354 | * c_rarg1 - in address |
5355 | * c_rarg2 - offset |
5356 | * c_rarg3 - len |
5357 | * not Win64 |
5358 | * c_rarg4 - k |
5359 | * Win64 |
5360 | * rsp+40 - k |
5361 | */ |
5362 | address generate_mulAdd() { |
5363 | __ align(CodeEntryAlignment); |
5364 | StubCodeMark mark(this, "StubRoutines" , "mulAdd" ); |
5365 | |
5366 | address start = __ pc(); |
5367 | // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) |
5368 | // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) |
5369 | const Register out = rdi; |
5370 | const Register in = rsi; |
5371 | const Register offset = r11; |
5372 | const Register len = rcx; |
5373 | const Register k = r8; |
5374 | |
5375 | // Next registers will be saved on stack in mul_add(). |
5376 | const Register tmp1 = r12; |
5377 | const Register tmp2 = r13; |
5378 | const Register tmp3 = r14; |
5379 | const Register tmp4 = r15; |
5380 | const Register tmp5 = rbx; |
5381 | |
5382 | BLOCK_COMMENT("Entry:" ); |
5383 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5384 | |
5385 | setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx |
5386 | // len => rcx, k => r8 |
5387 | // r9 and r10 may be used to save non-volatile registers |
5388 | #ifdef _WIN64 |
5389 | // last argument is on stack on Win64 |
5390 | __ movl(k, Address(rsp, 6 * wordSize)); |
5391 | #endif |
5392 | __ movptr(r11, rdx); // move offset in rdx to offset(r11) |
5393 | __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); |
5394 | |
5395 | restore_arg_regs(); |
5396 | |
5397 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5398 | __ ret(0); |
5399 | |
5400 | return start; |
5401 | } |
5402 | |
5403 | address generate_libmExp() { |
5404 | StubCodeMark mark(this, "StubRoutines" , "libmExp" ); |
5405 | |
5406 | address start = __ pc(); |
5407 | |
5408 | const XMMRegister x0 = xmm0; |
5409 | const XMMRegister x1 = xmm1; |
5410 | const XMMRegister x2 = xmm2; |
5411 | const XMMRegister x3 = xmm3; |
5412 | |
5413 | const XMMRegister x4 = xmm4; |
5414 | const XMMRegister x5 = xmm5; |
5415 | const XMMRegister x6 = xmm6; |
5416 | const XMMRegister x7 = xmm7; |
5417 | |
5418 | const Register tmp = r11; |
5419 | |
5420 | BLOCK_COMMENT("Entry:" ); |
5421 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5422 | |
5423 | __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); |
5424 | |
5425 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5426 | __ ret(0); |
5427 | |
5428 | return start; |
5429 | |
5430 | } |
5431 | |
5432 | address generate_libmLog() { |
5433 | StubCodeMark mark(this, "StubRoutines" , "libmLog" ); |
5434 | |
5435 | address start = __ pc(); |
5436 | |
5437 | const XMMRegister x0 = xmm0; |
5438 | const XMMRegister x1 = xmm1; |
5439 | const XMMRegister x2 = xmm2; |
5440 | const XMMRegister x3 = xmm3; |
5441 | |
5442 | const XMMRegister x4 = xmm4; |
5443 | const XMMRegister x5 = xmm5; |
5444 | const XMMRegister x6 = xmm6; |
5445 | const XMMRegister x7 = xmm7; |
5446 | |
5447 | const Register tmp1 = r11; |
5448 | const Register tmp2 = r8; |
5449 | |
5450 | BLOCK_COMMENT("Entry:" ); |
5451 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5452 | |
5453 | __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2); |
5454 | |
5455 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5456 | __ ret(0); |
5457 | |
5458 | return start; |
5459 | |
5460 | } |
5461 | |
5462 | address generate_libmLog10() { |
5463 | StubCodeMark mark(this, "StubRoutines" , "libmLog10" ); |
5464 | |
5465 | address start = __ pc(); |
5466 | |
5467 | const XMMRegister x0 = xmm0; |
5468 | const XMMRegister x1 = xmm1; |
5469 | const XMMRegister x2 = xmm2; |
5470 | const XMMRegister x3 = xmm3; |
5471 | |
5472 | const XMMRegister x4 = xmm4; |
5473 | const XMMRegister x5 = xmm5; |
5474 | const XMMRegister x6 = xmm6; |
5475 | const XMMRegister x7 = xmm7; |
5476 | |
5477 | const Register tmp = r11; |
5478 | |
5479 | BLOCK_COMMENT("Entry:" ); |
5480 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5481 | |
5482 | __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); |
5483 | |
5484 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5485 | __ ret(0); |
5486 | |
5487 | return start; |
5488 | |
5489 | } |
5490 | |
5491 | address generate_libmPow() { |
5492 | StubCodeMark mark(this, "StubRoutines" , "libmPow" ); |
5493 | |
5494 | address start = __ pc(); |
5495 | |
5496 | const XMMRegister x0 = xmm0; |
5497 | const XMMRegister x1 = xmm1; |
5498 | const XMMRegister x2 = xmm2; |
5499 | const XMMRegister x3 = xmm3; |
5500 | |
5501 | const XMMRegister x4 = xmm4; |
5502 | const XMMRegister x5 = xmm5; |
5503 | const XMMRegister x6 = xmm6; |
5504 | const XMMRegister x7 = xmm7; |
5505 | |
5506 | const Register tmp1 = r8; |
5507 | const Register tmp2 = r9; |
5508 | const Register tmp3 = r10; |
5509 | const Register tmp4 = r11; |
5510 | |
5511 | BLOCK_COMMENT("Entry:" ); |
5512 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5513 | |
5514 | __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); |
5515 | |
5516 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5517 | __ ret(0); |
5518 | |
5519 | return start; |
5520 | |
5521 | } |
5522 | |
5523 | address generate_libmSin() { |
5524 | StubCodeMark mark(this, "StubRoutines" , "libmSin" ); |
5525 | |
5526 | address start = __ pc(); |
5527 | |
5528 | const XMMRegister x0 = xmm0; |
5529 | const XMMRegister x1 = xmm1; |
5530 | const XMMRegister x2 = xmm2; |
5531 | const XMMRegister x3 = xmm3; |
5532 | |
5533 | const XMMRegister x4 = xmm4; |
5534 | const XMMRegister x5 = xmm5; |
5535 | const XMMRegister x6 = xmm6; |
5536 | const XMMRegister x7 = xmm7; |
5537 | |
5538 | const Register tmp1 = r8; |
5539 | const Register tmp2 = r9; |
5540 | const Register tmp3 = r10; |
5541 | const Register tmp4 = r11; |
5542 | |
5543 | BLOCK_COMMENT("Entry:" ); |
5544 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5545 | |
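     | // On Windows x64, rsi and rdi are callee-saved registers; the libm sine |
     | // kernel below may use them as scratch, so preserve them around the call. |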
5546 | #ifdef _WIN64 |
5547 | __ push(rsi); |
5548 | __ push(rdi); |
5549 | #endif |
5550 | __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4); |
5551 | |
5552 | #ifdef _WIN64 |
5553 | __ pop(rdi); |
5554 | __ pop(rsi); |
5555 | #endif |
5556 | |
5557 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5558 | __ ret(0); |
5559 | |
5560 | return start; |
5561 | |
5562 | } |
5563 | |
5564 | address generate_libmCos() { |
5565 | StubCodeMark mark(this, "StubRoutines" , "libmCos" ); |
5566 | |
5567 | address start = __ pc(); |
5568 | |
5569 | const XMMRegister x0 = xmm0; |
5570 | const XMMRegister x1 = xmm1; |
5571 | const XMMRegister x2 = xmm2; |
5572 | const XMMRegister x3 = xmm3; |
5573 | |
5574 | const XMMRegister x4 = xmm4; |
5575 | const XMMRegister x5 = xmm5; |
5576 | const XMMRegister x6 = xmm6; |
5577 | const XMMRegister x7 = xmm7; |
5578 | |
5579 | const Register tmp1 = r8; |
5580 | const Register tmp2 = r9; |
5581 | const Register tmp3 = r10; |
5582 | const Register tmp4 = r11; |
5583 | |
5584 | BLOCK_COMMENT("Entry:" ); |
5585 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5586 | |
5587 | #ifdef _WIN64 |
5588 | __ push(rsi); |
5589 | __ push(rdi); |
5590 | #endif |
5591 | __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); |
5592 | |
5593 | #ifdef _WIN64 |
5594 | __ pop(rdi); |
5595 | __ pop(rsi); |
5596 | #endif |
5597 | |
5598 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5599 | __ ret(0); |
5600 | |
5601 | return start; |
5602 | |
5603 | } |
5604 | |
5605 | address generate_libmTan() { |
5606 | StubCodeMark mark(this, "StubRoutines" , "libmTan" ); |
5607 | |
5608 | address start = __ pc(); |
5609 | |
5610 | const XMMRegister x0 = xmm0; |
5611 | const XMMRegister x1 = xmm1; |
5612 | const XMMRegister x2 = xmm2; |
5613 | const XMMRegister x3 = xmm3; |
5614 | |
5615 | const XMMRegister x4 = xmm4; |
5616 | const XMMRegister x5 = xmm5; |
5617 | const XMMRegister x6 = xmm6; |
5618 | const XMMRegister x7 = xmm7; |
5619 | |
5620 | const Register tmp1 = r8; |
5621 | const Register tmp2 = r9; |
5622 | const Register tmp3 = r10; |
5623 | const Register tmp4 = r11; |
5624 | |
5625 | BLOCK_COMMENT("Entry:" ); |
5626 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5627 | |
5628 | #ifdef _WIN64 |
5629 | __ push(rsi); |
5630 | __ push(rdi); |
5631 | #endif |
5632 | __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); |
5633 | |
5634 | #ifdef _WIN64 |
5635 | __ pop(rdi); |
5636 | __ pop(rsi); |
5637 | #endif |
5638 | |
5639 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5640 | __ ret(0); |
5641 | |
5642 | return start; |
5643 | |
5644 | } |
5645 | |
5646 | #undef __ |
5647 | #define __ masm-> |
5648 | |
5649 | // Continuation point for throwing of implicit exceptions that are |
5650 | // not handled in the current activation. Fabricates an exception |
5651 | // oop and initiates normal exception dispatching in this |
5652 | // frame. Since we need to preserve callee-saved values (currently |
5653 | // only for C2, but done for C1 as well) we need a callee-saved oop |
5654 | // map and therefore have to make these stubs into RuntimeStubs |
5655 | // rather than BufferBlobs. If the compiler needs all registers to |
5656 | // be preserved between the fault point and the exception handler |
5657 | // then it must assume responsibility for that in |
5658 | // AbstractCompiler::continuation_for_implicit_null_exception or |
5659 | // continuation_for_implicit_division_by_zero_exception. All other |
5660 | // implicit exceptions (e.g., NullPointerException or |
5661 | // AbstractMethodError on entry) are either at call sites or |
5662 | // otherwise assume that stack unwinding will be initiated, so |
5663 | // caller saved registers were assumed volatile in the compiler. |
5664 | address generate_throw_exception(const char* name, |
5665 | address runtime_entry, |
5666 | Register arg1 = noreg, |
5667 | Register arg2 = noreg) { |
5668 | // Information about frame layout at time of blocking runtime call. |
5669 | // Note that we only have to preserve callee-saved registers since |
5670 | // the compilers are responsible for supplying a continuation point |
5671 | // if they expect all registers to be preserved. |
5672 | enum layout { |
5673 | rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, |
5674 | rbp_off2, |
5675 | return_off, |
5676 | return_off2, |
5677 | framesize // inclusive of return address |
5678 | }; |
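     | // Resulting frame layout, in 32-bit slots from rsp after the prolog: |
     | //   [register-argument save area (Win64 shadow space; empty elsewhere)] |
     | //   [saved rbp : 2 slots] [return address : 2 slots] |
     | // framesize therefore counts everything up to and including the return address. |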
5679 | |
5680 | int insts_size = 512; |
5681 | int locs_size = 64; |
5682 | |
5683 | CodeBuffer code(name, insts_size, locs_size); |
5684 | OopMapSet* oop_maps = new OopMapSet(); |
5685 | MacroAssembler* masm = new MacroAssembler(&code); |
5686 | |
5687 | address start = __ pc(); |
5688 | |
5689 | // This is an inlined and slightly modified version of call_VM |
5690 | // which has the ability to fetch the return PC out of |
5691 | // thread-local storage and also sets up last_Java_sp slightly |
5692 | // differently than the real call_VM |
5693 | |
5694 | __ enter(); // required for proper stackwalking of RuntimeStub frame |
5695 | |
5696 | assert(is_even(framesize/2), "sp not 16-byte aligned" ); |
5697 | |
5698 | // return address and rbp are already in place, so only (framesize - 4) 32-bit slots remain to be allocated |
5699 | __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog |
5700 | |
5701 | int frame_complete = __ pc() - start; |
5702 | |
5703 | // Set up last_Java_sp and last_Java_fp |
5704 | address the_pc = __ pc(); |
5705 | __ set_last_Java_frame(rsp, rbp, the_pc); |
5706 | __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack |
5707 | |
5708 | // Call runtime |
5709 | if (arg1 != noreg) { |
5710 | assert(arg2 != c_rarg1, "clobbered" ); |
5711 | __ movptr(c_rarg1, arg1); |
5712 | } |
5713 | if (arg2 != noreg) { |
5714 | __ movptr(c_rarg2, arg2); |
5715 | } |
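     | // The SharedRuntime::throw_* entries used with this generator take the |
     | // current JavaThread as their first argument. |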
5716 | __ movptr(c_rarg0, r15_thread); |
5717 | BLOCK_COMMENT("call runtime_entry" ); |
5718 | __ call(RuntimeAddress(runtime_entry)); |
5719 | |
5720 | // Generate oop map; no entries are added since nothing in this frame holds an oop across the runtime call |
5721 | OopMap* map = new OopMap(framesize, 0); |
5722 | |
5723 | oop_maps->add_gc_map(the_pc - start, map); |
5724 | |
5725 | __ reset_last_Java_frame(true); |
5726 | |
5727 | __ leave(); // required for proper stackwalking of RuntimeStub frame |
5728 | |
5729 | // check for pending exceptions |
5730 | #ifdef ASSERT |
5731 | Label L; |
5732 | __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), |
5733 | (int32_t) NULL_WORD); |
5734 | __ jcc(Assembler::notEqual, L); |
5735 | __ should_not_reach_here(); |
5736 | __ bind(L); |
5737 | #endif // ASSERT |
5738 | __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); |
5739 | |
5740 | |
5741 | // codeBlob framesize is in words (not VMRegImpl::slot_size) |
5742 | RuntimeStub* stub = |
5743 | RuntimeStub::new_runtime_stub(name, |
5744 | &code, |
5745 | frame_complete, |
5746 | (framesize >> (LogBytesPerWord - LogBytesPerInt)), |
5747 | oop_maps, false); |
5748 | return stub->entry_point(); |
5749 | } |
5750 | |
5751 | void create_control_words() { |
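     | // x87 control word layout: bits 0-5 mask the individual exceptions, |
     | // bits 8-9 select the precision control and bits 10-11 the rounding mode |
     | // (00b = nearest, 11b = toward zero). In MXCSR, bits 7-12 are the |
     | // exception masks and bits 13-14 the rounding mode. |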
5752 | // Round to nearest, 53-bit mode, exceptions masked |
5753 | StubRoutines::_fpu_cntrl_wrd_std = 0x027F; |
5754 | // Round to zero, 53-bit mode, exceptions masked |
5755 | StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F; |
5756 | // Round to nearest, 24-bit mode, exceptions masked |
5757 | StubRoutines::_fpu_cntrl_wrd_24 = 0x007F; |
5758 | // MXCSR: round to nearest, all exceptions masked |
5759 | StubRoutines::_mxcsr_std = 0x1F80; |
5760 | // Note: the following two constants are 80-bit extended-precision values; |
5761 | // their layout is critical for correct loading by the FPU. |
5762 | // Bias for strict fp multiply/divide |
5763 | StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000 |
5764 | StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000; |
5765 | StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff; |
5766 | // Un-Bias for strict fp multiply/divide |
5767 | StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000 |
5768 | StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000; |
5769 | StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff; |
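     | // The exponent field of an 80-bit value is biased by 16383, so 0x03ff |
     | // encodes 2^(1023 - 16383) = 2^-15360 and 0x7bff encodes |
     | // 2^(31743 - 16383) = 2^+15360; word [1] carries the explicit integer bit. |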
5770 | } |
5771 | |
5772 | // Initialization |
5773 | void generate_initial() { |
5774 | // Generates all stubs and initializes the entry points |
5775 | |
5776 | // These platform-specific settings are needed by generate_call_stub() |
5777 | create_control_words(); |
5778 | |
5779 | // entry points that exist on all platforms. Note: This is code |
5780 | // that could be shared among different platforms - however the |
5781 | // benefit seems to be smaller than the disadvantage of having a |
5782 | // much more complicated generator structure. See also comment in |
5783 | // stubRoutines.hpp. |
5784 | |
5785 | StubRoutines::_forward_exception_entry = generate_forward_exception(); |
5786 | |
5787 | StubRoutines::_call_stub_entry = |
5788 | generate_call_stub(StubRoutines::_call_stub_return_address); |
5789 | |
5790 | // referenced by megamorphic calls |
5791 | StubRoutines::_catch_exception_entry = generate_catch_exception(); |
5792 | |
5793 | // atomic calls |
5794 | StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); |
5795 | StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long(); |
5796 | StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); |
5797 | StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte(); |
5798 | StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); |
5799 | StubRoutines::_atomic_add_entry = generate_atomic_add(); |
5800 | StubRoutines::_atomic_add_long_entry = generate_atomic_add_long(); |
5801 | StubRoutines::_fence_entry = generate_orderaccess_fence(); |
5802 | |
5803 | // platform dependent |
5804 | StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp(); |
5805 | StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp(); |
5806 | |
5807 | StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr(); |
5808 | |
5809 | // Build this early so it's available for the interpreter. |
5810 | StubRoutines::_throw_StackOverflowError_entry = |
5811 | generate_throw_exception("StackOverflowError throw_exception" , |
5812 | CAST_FROM_FN_PTR(address, |
5813 | SharedRuntime:: |
5814 | throw_StackOverflowError)); |
5815 | StubRoutines::_throw_delayed_StackOverflowError_entry = |
5816 | generate_throw_exception("delayed StackOverflowError throw_exception" , |
5817 | CAST_FROM_FN_PTR(address, |
5818 | SharedRuntime:: |
5819 | throw_delayed_StackOverflowError)); |
5820 | if (UseCRC32Intrinsics) { |
5821 | // set the table address before generating the stub that uses it |
5822 | StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; |
5823 | StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); |
5824 | } |
5825 | |
5826 | if (UseCRC32CIntrinsics) { |
5827 | bool supports_clmul = VM_Version::supports_clmul(); |
5828 | StubRoutines::x86::generate_CRC32C_table(supports_clmul); |
5829 | StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; |
5830 | StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); |
5831 | } |
5832 | if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) { |
5833 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) || |
5834 | vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) || |
5835 | vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { |
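     | // Argument-reduction tables and polynomial coefficients shared by the |
     | // sin/cos/tan stubs generated below. |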
5836 | StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF; |
5837 | StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2; |
5838 | StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4; |
5839 | StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable; |
5840 | StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2; |
5841 | StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3; |
5842 | StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1; |
5843 | StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE; |
5844 | StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4; |
5845 | StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV; |
5846 | StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK; |
5847 | StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1; |
5848 | StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3; |
5849 | StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO; |
5850 | } |
5851 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) { |
5852 | StubRoutines::_dexp = generate_libmExp(); |
5853 | } |
5854 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { |
5855 | StubRoutines::_dlog = generate_libmLog(); |
5856 | } |
5857 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) { |
5858 | StubRoutines::_dlog10 = generate_libmLog10(); |
5859 | } |
5860 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) { |
5861 | StubRoutines::_dpow = generate_libmPow(); |
5862 | } |
5863 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { |
5864 | StubRoutines::_dsin = generate_libmSin(); |
5865 | } |
5866 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { |
5867 | StubRoutines::_dcos = generate_libmCos(); |
5868 | } |
5869 | if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { |
5870 | StubRoutines::_dtan = generate_libmTan(); |
5871 | } |
5872 | } |
5873 | } |
5874 | |
5875 | void generate_all() { |
5876 | // Generates all stubs and initializes the entry points |
5877 | |
5878 | // These entry points require SharedInfo::stack0 to be set up in |
5879 | // non-core builds and need to be relocatable, so they each |
5880 | // fabricate a RuntimeStub internally. |
5881 | StubRoutines::_throw_AbstractMethodError_entry = |
5882 | generate_throw_exception("AbstractMethodError throw_exception" , |
5883 | CAST_FROM_FN_PTR(address, |
5884 | SharedRuntime:: |
5885 | throw_AbstractMethodError)); |
5886 | |
5887 | StubRoutines::_throw_IncompatibleClassChangeError_entry = |
5888 | generate_throw_exception("IncompatibleClassChangeError throw_exception" , |
5889 | CAST_FROM_FN_PTR(address, |
5890 | SharedRuntime:: |
5891 | throw_IncompatibleClassChangeError)); |
5892 | |
5893 | StubRoutines::_throw_NullPointerException_at_call_entry = |
5894 | generate_throw_exception("NullPointerException at call throw_exception" , |
5895 | CAST_FROM_FN_PTR(address, |
5896 | SharedRuntime:: |
5897 | throw_NullPointerException_at_call)); |
5898 | |
5899 | // entry points that are platform specific |
5900 | StubRoutines::x86::_f2i_fixup = generate_f2i_fixup(); |
5901 | StubRoutines::x86::_f2l_fixup = generate_f2l_fixup(); |
5902 | StubRoutines::x86::_d2i_fixup = generate_d2i_fixup(); |
5903 | StubRoutines::x86::_d2l_fixup = generate_d2l_fixup(); |
5904 | |
5905 | StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask" , 0x7FFFFFFF7FFFFFFF); |
5906 | StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip" , 0x8000000080000000); |
5907 | StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask" , 0x7FFFFFFFFFFFFFFF); |
5908 | StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip" , 0x8000000000000000); |
5909 | StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask" , 0x7FFFFFFF7FFFFFFF); |
5910 | StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip" , 0x8000000080000000); |
5911 | StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask" , 0x7FFFFFFFFFFFFFFF); |
5912 | StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip" , 0x8000000000000000); |
5913 | StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask" , 0x00ff00ff00ff00ff); |
5914 | StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask" ); |
5915 | StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask" , 0x8000000000000000); |
5916 | |
5917 | // support for verify_oop (must happen after universe_init) |
5918 | StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); |
5919 | |
5920 | // arraycopy stubs used by compilers |
5921 | generate_arraycopy_stubs(); |
5922 | |
5923 | // don't bother generating these AES intrinsic stubs unless global flag is set |
5924 | if (UseAESIntrinsics) { |
5925 | StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others |
5926 | StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); |
5927 | StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); |
5928 | StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); |
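     | // With VAES plus AVX-512VL/DQ, a wider vectorized variant of CBC |
     | // decryption can be generated. |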
5929 | if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) { |
5930 | StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt(); |
5931 | } else { |
5932 | StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); |
5933 | } |
5934 | } |
5935 | if (UseAESCTRIntrinsics) { |
5936 | StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); |
5937 | StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); |
5938 | } |
5939 | |
5940 | if (UseSHA1Intrinsics) { |
5941 | StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask(); |
5942 | StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask(); |
5943 | StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress" ); |
5944 | StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB" ); |
5945 | } |
5946 | if (UseSHA256Intrinsics) { |
5947 | StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256; |
5948 | char* dst = (char*)StubRoutines::x86::_k256_W; |
5949 | char* src = (char*)StubRoutines::x86::_k256; |
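     | // Duplicate each 128-bit group of the SHA-256 round constants into both |
     | // halves of a 256-bit row; this is the doubled layout the AVX2 compression |
     | // code expects. |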
5950 | for (int ii = 0; ii < 16; ++ii) { |
5951 | memcpy(dst + 32 * ii, src + 16 * ii, 16); |
5952 | memcpy(dst + 32 * ii + 16, src + 16 * ii, 16); |
5953 | } |
5954 | StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W; |
5955 | StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask(); |
5956 | StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress" ); |
5957 | StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB" ); |
5958 | } |
5959 | if (UseSHA512Intrinsics) { |
5960 | StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W; |
5961 | StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512(); |
5962 | StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress" ); |
5963 | StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB" ); |
5964 | } |
5965 | |
5966 | // Generate GHASH intrinsics code |
5967 | if (UseGHASHIntrinsics) { |
5968 | StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); |
5969 | StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); |
5970 | if (VM_Version::supports_avx()) { |
5971 | StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr(); |
5972 | StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr(); |
5973 | StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks(); |
5974 | } else { |
5975 | StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); |
5976 | } |
5977 | } |
5978 | |
5979 | if (UseBASE64Intrinsics) { |
5980 | StubRoutines::x86::_and_mask = base64_and_mask_addr(); |
5981 | StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr(); |
5982 | StubRoutines::x86::_base64_charset = base64_charset_addr(); |
5983 | StubRoutines::x86::_url_charset = base64url_charset_addr(); |
5984 | StubRoutines::x86::_gather_mask = base64_gather_mask_addr(); |
5985 | StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr(); |
5986 | StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr(); |
5987 | StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); |
5988 | } |
5989 | |
5990 | // Safefetch stubs. |
5991 | generate_safefetch("SafeFetch32" , sizeof(int), &StubRoutines::_safefetch32_entry, |
5992 | &StubRoutines::_safefetch32_fault_pc, |
5993 | &StubRoutines::_safefetch32_continuation_pc); |
5994 | generate_safefetch("SafeFetchN" , sizeof(intptr_t), &StubRoutines::_safefetchN_entry, |
5995 | &StubRoutines::_safefetchN_fault_pc, |
5996 | &StubRoutines::_safefetchN_continuation_pc); |
5997 | |
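     | // Some collectors (e.g. ZGC) install nmethod entry barriers; emit the |
     | // matching method entry barrier stub when the barrier set provides one. |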
5998 | BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); |
5999 | if (bs_nm != NULL) { |
6000 | StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier(); |
6001 | } |
6002 | #ifdef COMPILER2 |
6003 | if (UseMultiplyToLenIntrinsic) { |
6004 | StubRoutines::_multiplyToLen = generate_multiplyToLen(); |
6005 | } |
6006 | if (UseSquareToLenIntrinsic) { |
6007 | StubRoutines::_squareToLen = generate_squareToLen(); |
6008 | } |
6009 | if (UseMulAddIntrinsic) { |
6010 | StubRoutines::_mulAdd = generate_mulAdd(); |
6011 | } |
6012 | #ifndef _WINDOWS |
6013 | if (UseMontgomeryMultiplyIntrinsic) { |
6014 | StubRoutines::_montgomeryMultiply |
6015 | = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply); |
6016 | } |
6017 | if (UseMontgomerySquareIntrinsic) { |
6018 | StubRoutines::_montgomerySquare |
6019 | = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); |
6020 | } |
6021 | #endif // !_WINDOWS |
6022 | #endif // COMPILER2 |
6023 | |
6024 | if (UseVectorizedMismatchIntrinsic) { |
6025 | StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch(); |
6026 | } |
6027 | } |
6028 | |
6029 | public: |
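     | // Stubs are generated in two phases: generate_initial() runs early so the |
     | // interpreter has what it needs; generate_all() runs later (after universe |
     | // initialization) for the remaining stubs. |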
6030 | StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { |
6031 | if (all) { |
6032 | generate_all(); |
6033 | } else { |
6034 | generate_initial(); |
6035 | } |
6036 | } |
6037 | }; // end class StubGenerator |
6038 | |
6039 | void StubGenerator_generate(CodeBuffer* code, bool all) { |
6040 | StubGenerator g(code, all); |
6041 | } |
6042 | |