1 | /* |
2 | * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | * |
23 | */ |
24 | |
25 | #include "precompiled.hpp" |
26 | #include "jvm.h" |
27 | #include "asm/assembler.hpp" |
28 | #include "asm/assembler.inline.hpp" |
29 | #include "compiler/disassembler.hpp" |
30 | #include "gc/shared/barrierSet.hpp" |
31 | #include "gc/shared/barrierSetAssembler.hpp" |
32 | #include "gc/shared/collectedHeap.inline.hpp" |
33 | #include "interpreter/interpreter.hpp" |
34 | #include "memory/resourceArea.hpp" |
35 | #include "memory/universe.hpp" |
36 | #include "oops/accessDecorators.hpp" |
37 | #include "oops/compressedOops.inline.hpp" |
38 | #include "oops/klass.inline.hpp" |
39 | #include "prims/methodHandles.hpp" |
40 | #include "runtime/biasedLocking.hpp" |
41 | #include "runtime/flags/flagSetting.hpp" |
42 | #include "runtime/interfaceSupport.inline.hpp" |
43 | #include "runtime/objectMonitor.hpp" |
44 | #include "runtime/os.hpp" |
45 | #include "runtime/safepoint.hpp" |
46 | #include "runtime/safepointMechanism.hpp" |
47 | #include "runtime/sharedRuntime.hpp" |
48 | #include "runtime/stubRoutines.hpp" |
49 | #include "runtime/thread.hpp" |
50 | #include "utilities/macros.hpp" |
51 | #include "crc32c.h" |
52 | #ifdef COMPILER2 |
53 | #include "opto/intrinsicnode.hpp" |
54 | #endif |
55 | |
56 | #ifdef PRODUCT |
#define BLOCK_COMMENT(str) /* nothing */
58 | #define STOP(error) stop(error) |
59 | #else |
60 | #define BLOCK_COMMENT(str) block_comment(str) |
61 | #define STOP(error) block_comment(error); stop(error) |
62 | #endif |
63 | |
64 | #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") |
65 | |
66 | #ifdef ASSERT |
67 | bool AbstractAssembler::pd_check_instruction_mark() { return true; } |
68 | #endif |
69 | |
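// Maps each Assembler::Condition encoding (noted in the comments on the
// entries below) to its logical negation, so a branch's sense can be
// inverted simply by indexing this table with the original condition code.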
70 | static Assembler::Condition reverse[] = { |
71 | Assembler::noOverflow /* overflow = 0x0 */ , |
72 | Assembler::overflow /* noOverflow = 0x1 */ , |
73 | Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ , |
74 | Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ , |
75 | Assembler::notZero /* zero = 0x4, equal = 0x4 */ , |
76 | Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ , |
77 | Assembler::above /* belowEqual = 0x6 */ , |
78 | Assembler::belowEqual /* above = 0x7 */ , |
79 | Assembler::positive /* negative = 0x8 */ , |
80 | Assembler::negative /* positive = 0x9 */ , |
81 | Assembler::noParity /* parity = 0xa */ , |
82 | Assembler::parity /* noParity = 0xb */ , |
83 | Assembler::greaterEqual /* less = 0xc */ , |
84 | Assembler::less /* greaterEqual = 0xd */ , |
85 | Assembler::greater /* lessEqual = 0xe */ , |
86 | Assembler::lessEqual /* greater = 0xf, */ |
87 | |
88 | }; |
89 | |
90 | |
91 | // Implementation of MacroAssembler |
92 | |
// First, all the routines that have distinct 32-bit and 64-bit versions,
// unless the difference is trivial (a line or so).
95 | |
96 | #ifndef _LP64 |
97 | |
98 | // 32bit versions |
99 | |
100 | Address MacroAssembler::as_Address(AddressLiteral adr) { |
101 | return Address(adr.target(), adr.rspec()); |
102 | } |
103 | |
104 | Address MacroAssembler::as_Address(ArrayAddress adr) { |
105 | return Address::make_array(adr); |
106 | } |
107 | |
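// Note: on 32-bit the C arguments are passed on the stack (see the
// pass_arg0..pass_arg3 helpers later in this file). The callee does not pop
// them, so the stack pointer is simply bumped past them after the call.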
108 | void MacroAssembler::call_VM_leaf_base(address entry_point, |
109 | int number_of_arguments) { |
110 | call(RuntimeAddress(entry_point)); |
111 | increment(rsp, number_of_arguments * wordSize); |
112 | } |
113 | |
114 | void MacroAssembler::cmpklass(Address src1, Metadata* obj) { |
115 | cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); |
116 | } |
117 | |
118 | void MacroAssembler::cmpklass(Register src1, Metadata* obj) { |
119 | cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); |
120 | } |
121 | |
122 | void MacroAssembler::cmpoop_raw(Address src1, jobject obj) { |
123 | cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); |
124 | } |
125 | |
126 | void MacroAssembler::cmpoop_raw(Register src1, jobject obj) { |
127 | cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); |
128 | } |
129 | |
130 | void MacroAssembler::cmpoop(Address src1, jobject obj) { |
131 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
132 | bs->obj_equals(this, src1, obj); |
133 | } |
134 | |
135 | void MacroAssembler::cmpoop(Register src1, jobject obj) { |
136 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
137 | bs->obj_equals(this, src1, obj); |
138 | } |
139 | |
140 | void MacroAssembler::extend_sign(Register hi, Register lo) { |
141 | // According to Intel Doc. AP-526, "Integer Divide", p.18. |
142 | if (VM_Version::is_P6() && hi == rdx && lo == rax) { |
143 | cdql(); |
144 | } else { |
145 | movl(hi, lo); |
146 | sarl(hi, 31); |
147 | } |
148 | } |
149 | |
150 | void MacroAssembler::jC2(Register tmp, Label& L) { |
151 | // set parity bit if FPU flag C2 is set (via rax) |
152 | save_rax(tmp); |
153 | fwait(); fnstsw_ax(); |
154 | sahf(); |
155 | restore_rax(tmp); |
156 | // branch |
157 | jcc(Assembler::parity, L); |
158 | } |
159 | |
160 | void MacroAssembler::jnC2(Register tmp, Label& L) { |
161 | // set parity bit if FPU flag C2 is set (via rax) |
162 | save_rax(tmp); |
163 | fwait(); fnstsw_ax(); |
164 | sahf(); |
165 | restore_rax(tmp); |
166 | // branch |
167 | jcc(Assembler::noParity, L); |
168 | } |
169 | |
170 | // 32bit can do a case table jump in one instruction but we no longer allow the base |
171 | // to be installed in the Address class |
172 | void MacroAssembler::jump(ArrayAddress entry) { |
173 | jmp(as_Address(entry)); |
174 | } |
175 | |
176 | // Note: y_lo will be destroyed |
177 | void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { |
178 | // Long compare for Java (semantics as described in JVM spec.) |
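  // Result convention matches the lcmp bytecode: x_hi ends up as -1, 0 or +1.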
179 | Label high, low, done; |
180 | |
181 | cmpl(x_hi, y_hi); |
182 | jcc(Assembler::less, low); |
183 | jcc(Assembler::greater, high); |
184 | // x_hi is the return register |
185 | xorl(x_hi, x_hi); |
186 | cmpl(x_lo, y_lo); |
187 | jcc(Assembler::below, low); |
188 | jcc(Assembler::equal, done); |
189 | |
190 | bind(high); |
191 | xorl(x_hi, x_hi); |
192 | increment(x_hi); |
193 | jmp(done); |
194 | |
195 | bind(low); |
196 | xorl(x_hi, x_hi); |
197 | decrementl(x_hi); |
198 | |
199 | bind(done); |
200 | } |
201 | |
202 | void MacroAssembler::lea(Register dst, AddressLiteral src) { |
203 | mov_literal32(dst, (int32_t)src.target(), src.rspec()); |
204 | } |
205 | |
206 | void MacroAssembler::lea(Address dst, AddressLiteral adr) { |
207 | // leal(dst, as_Address(adr)); |
208 | // see note in movl as to why we must use a move |
209 | mov_literal32(dst, (int32_t) adr.target(), adr.rspec()); |
210 | } |
211 | |
212 | void MacroAssembler::leave() { |
213 | mov(rsp, rbp); |
214 | pop(rbp); |
215 | } |
216 | |
217 | void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) { |
218 | // Multiplication of two Java long values stored on the stack |
219 | // as illustrated below. Result is in rdx:rax. |
220 | // |
221 | // rsp ---> [ ?? ] \ \ |
222 | // .... | y_rsp_offset | |
223 | // [ y_lo ] / (in bytes) | x_rsp_offset |
224 | // [ y_hi ] | (in bytes) |
225 | // .... | |
226 | // [ x_lo ] / |
227 | // [ x_hi ] |
228 | // .... |
229 | // |
230 | // Basic idea: lo(result) = lo(x_lo * y_lo) |
231 | // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) |
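  // (x_hi * y_hi contributes only to bits >= 64, and the high halves of the
  //  two cross products likewise overflow past bit 63, so three 32x32-bit
  //  multiplies are enough for the low 64 bits of the product.)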
232 | Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset); |
233 | Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset); |
234 | Label quick; |
235 | // load x_hi, y_hi and check if quick |
236 | // multiplication is possible |
237 | movl(rbx, x_hi); |
238 | movl(rcx, y_hi); |
239 | movl(rax, rbx); |
  orl(rbx, rcx);               // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick); // if rbx = 0 do quick multiply
242 | // do full multiplication |
243 | // 1st step |
244 | mull(y_lo); // x_hi * y_lo |
  movl(rbx, rax); // save lo(x_hi * y_lo) in rbx
246 | // 2nd step |
247 | movl(rax, x_lo); |
248 | mull(rcx); // x_lo * y_hi |
  addl(rbx, rax); // add lo(x_lo * y_hi) to rbx
250 | // 3rd step |
  bind(quick); // note: rbx = 0 if quick multiply!
252 | movl(rax, x_lo); |
253 | mull(y_lo); // x_lo * y_lo |
254 | addl(rdx, rbx); // correct hi(x_lo * y_lo) |
255 | } |
256 | |
257 | void MacroAssembler::lneg(Register hi, Register lo) { |
258 | negl(lo); |
259 | adcl(hi, 0); |
260 | negl(hi); |
261 | } |
262 | |
263 | void MacroAssembler::lshl(Register hi, Register lo) { |
264 | // Java shift left long support (semantics as described in JVM spec., p.305) |
265 | // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) |
266 | // shift value is in rcx ! |
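  // shldl(hi, lo) shifts hi left by cl bits while filling from the high-order
  // bits of lo; together with shll(lo) it forms a full 64-bit left shift for
  // shift counts below BitsPerWord.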
267 | assert(hi != rcx, "must not use rcx" ); |
268 | assert(lo != rcx, "must not use rcx" ); |
269 | const Register s = rcx; // shift count |
270 | const int n = BitsPerWord; |
271 | Label L; |
272 | andl(s, 0x3f); // s := s & 0x3f (s < 0x40) |
273 | cmpl(s, n); // if (s < n) |
274 | jcc(Assembler::less, L); // else (s >= n) |
275 | movl(hi, lo); // x := x << n |
276 | xorl(lo, lo); |
277 | // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! |
278 | bind(L); // s (mod n) < n |
279 | shldl(hi, lo); // x := x << s |
280 | shll(lo); |
281 | } |
282 | |
283 | |
284 | void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) { |
285 | // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) |
286 | // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) |
287 | assert(hi != rcx, "must not use rcx" ); |
288 | assert(lo != rcx, "must not use rcx" ); |
289 | const Register s = rcx; // shift count |
290 | const int n = BitsPerWord; |
291 | Label L; |
292 | andl(s, 0x3f); // s := s & 0x3f (s < 0x40) |
293 | cmpl(s, n); // if (s < n) |
294 | jcc(Assembler::less, L); // else (s >= n) |
295 | movl(lo, hi); // x := x >> n |
296 | if (sign_extension) sarl(hi, 31); |
297 | else xorl(hi, hi); |
298 | // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! |
299 | bind(L); // s (mod n) < n |
300 | shrdl(lo, hi); // x := x >> s |
301 | if (sign_extension) sarl(hi); |
302 | else shrl(hi); |
303 | } |
304 | |
305 | void MacroAssembler::movoop(Register dst, jobject obj) { |
306 | mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); |
307 | } |
308 | |
309 | void MacroAssembler::movoop(Address dst, jobject obj) { |
310 | mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); |
311 | } |
312 | |
313 | void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { |
314 | mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); |
315 | } |
316 | |
317 | void MacroAssembler::mov_metadata(Address dst, Metadata* obj) { |
318 | mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); |
319 | } |
320 | |
321 | void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { |
322 | // scratch register is not used, |
323 | // it is defined to match parameters of 64-bit version of this method. |
324 | if (src.is_lval()) { |
325 | mov_literal32(dst, (intptr_t)src.target(), src.rspec()); |
326 | } else { |
327 | movl(dst, as_Address(src)); |
328 | } |
329 | } |
330 | |
331 | void MacroAssembler::movptr(ArrayAddress dst, Register src) { |
332 | movl(as_Address(dst), src); |
333 | } |
334 | |
335 | void MacroAssembler::movptr(Register dst, ArrayAddress src) { |
336 | movl(dst, as_Address(src)); |
337 | } |
338 | |
339 | // src should NEVER be a real pointer. Use AddressLiteral for true pointers |
340 | void MacroAssembler::movptr(Address dst, intptr_t src) { |
341 | movl(dst, src); |
342 | } |
343 | |
344 | |
345 | void MacroAssembler::pop_callee_saved_registers() { |
346 | pop(rcx); |
347 | pop(rdx); |
348 | pop(rdi); |
349 | pop(rsi); |
350 | } |
351 | |
352 | void MacroAssembler::pop_fTOS() { |
353 | fld_d(Address(rsp, 0)); |
354 | addl(rsp, 2 * wordSize); |
355 | } |
356 | |
357 | void MacroAssembler::push_callee_saved_registers() { |
358 | push(rsi); |
359 | push(rdi); |
360 | push(rdx); |
361 | push(rcx); |
362 | } |
363 | |
364 | void MacroAssembler::push_fTOS() { |
365 | subl(rsp, 2 * wordSize); |
366 | fstp_d(Address(rsp, 0)); |
367 | } |
368 | |
369 | |
370 | void MacroAssembler::pushoop(jobject obj) { |
371 | push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate()); |
372 | } |
373 | |
374 | void MacroAssembler::pushklass(Metadata* obj) { |
375 | push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate()); |
376 | } |
377 | |
378 | void MacroAssembler::pushptr(AddressLiteral src) { |
379 | if (src.is_lval()) { |
380 | push_literal32((int32_t)src.target(), src.rspec()); |
381 | } else { |
382 | pushl(as_Address(src)); |
383 | } |
384 | } |
385 | |
386 | void MacroAssembler::set_word_if_not_zero(Register dst) { |
387 | xorl(dst, dst); |
388 | set_byte_if_not_zero(dst); |
389 | } |
390 | |
391 | static void pass_arg0(MacroAssembler* masm, Register arg) { |
392 | masm->push(arg); |
393 | } |
394 | |
395 | static void pass_arg1(MacroAssembler* masm, Register arg) { |
396 | masm->push(arg); |
397 | } |
398 | |
399 | static void pass_arg2(MacroAssembler* masm, Register arg) { |
400 | masm->push(arg); |
401 | } |
402 | |
403 | static void pass_arg3(MacroAssembler* masm, Register arg) { |
404 | masm->push(arg); |
405 | } |
406 | |
407 | #ifndef PRODUCT |
408 | extern "C" void findpc(intptr_t x); |
409 | #endif |
410 | |
411 | void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) { |
  // In order to get locks to work, we need to fake an in_VM state
413 | JavaThread* thread = JavaThread::current(); |
414 | JavaThreadState saved_state = thread->thread_state(); |
415 | thread->set_thread_state(_thread_in_vm); |
  if (ShowMessageBoxOnError) {
420 | if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { |
421 | ttyLocker ttyl; |
422 | BytecodeCounter::print(); |
423 | } |
424 | // To see where a verify_oop failed, get $ebx+40/X for this frame. |
425 | // This is the value of eip which points to where verify_oop will return. |
426 | if (os::message_box(msg, "Execution stopped, print registers?" )) { |
427 | print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip); |
428 | BREAKPOINT; |
429 | } |
430 | } else { |
431 | ttyLocker ttyl; |
432 | ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n" , msg); |
433 | } |
434 | // Don't assert holding the ttyLock |
435 | assert(false, "DEBUG MESSAGE: %s" , msg); |
436 | ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); |
437 | } |
438 | |
439 | void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) { |
440 | ttyLocker ttyl; |
441 | FlagSetting fs(Debugging, true); |
442 | tty->print_cr("eip = 0x%08x" , eip); |
443 | #ifndef PRODUCT |
444 | if ((WizardMode || Verbose) && PrintMiscellaneous) { |
445 | tty->cr(); |
446 | findpc(eip); |
447 | tty->cr(); |
448 | } |
449 | #endif |
450 | #define PRINT_REG(rax) \ |
451 | { tty->print("%s = ", #rax); os::print_location(tty, rax); } |
452 | PRINT_REG(rax); |
453 | PRINT_REG(rbx); |
454 | PRINT_REG(rcx); |
455 | PRINT_REG(rdx); |
456 | PRINT_REG(rdi); |
457 | PRINT_REG(rsi); |
458 | PRINT_REG(rbp); |
459 | PRINT_REG(rsp); |
460 | #undef PRINT_REG |
  // Print some words near top of stack.
462 | int* dump_sp = (int*) rsp; |
463 | for (int col1 = 0; col1 < 8; col1++) { |
464 | tty->print("(rsp+0x%03x) 0x%08x: " , (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); |
465 | os::print_location(tty, *dump_sp++); |
466 | } |
467 | for (int row = 0; row < 16; row++) { |
468 | tty->print("(rsp+0x%03x) 0x%08x: " , (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); |
469 | for (int col = 0; col < 8; col++) { |
470 | tty->print(" 0x%08x" , *dump_sp++); |
471 | } |
472 | tty->cr(); |
473 | } |
474 | // Print some instructions around pc: |
475 | Disassembler::decode((address)eip-64, (address)eip); |
476 | tty->print_cr("--------" ); |
477 | Disassembler::decode((address)eip, (address)eip+32); |
478 | } |
479 | |
480 | void MacroAssembler::stop(const char* msg) { |
481 | ExternalAddress message((address)msg); |
482 | // push address of message |
483 | pushptr(message.addr()); |
484 | { Label L; call(L, relocInfo::none); bind(L); } // push eip |
485 | pusha(); // push registers |
486 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32))); |
487 | hlt(); |
488 | } |
489 | |
490 | void MacroAssembler::warn(const char* msg) { |
491 | push_CPU_state(); |
492 | |
493 | ExternalAddress message((address) msg); |
494 | // push address of message |
495 | pushptr(message.addr()); |
496 | |
497 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning))); |
498 | addl(rsp, wordSize); // discard argument |
499 | pop_CPU_state(); |
500 | } |
501 | |
502 | void MacroAssembler::print_state() { |
503 | { Label L; call(L, relocInfo::none); bind(L); } // push eip |
504 | pusha(); // push registers |
505 | |
506 | push_CPU_state(); |
507 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32))); |
508 | pop_CPU_state(); |
509 | |
510 | popa(); |
511 | addl(rsp, wordSize); |
512 | } |
513 | |
514 | #else // _LP64 |
515 | |
516 | // 64 bit versions |
517 | |
518 | Address MacroAssembler::as_Address(AddressLiteral adr) { |
519 | // amd64 always does this as a pc-rel |
520 | // we can be absolute or disp based on the instruction type |
521 | // jmp/call are displacements others are absolute |
522 | assert(!adr.is_lval(), "must be rval" ); |
523 | assert(reachable(adr), "must be" ); |
524 | return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc()); |
525 | |
526 | } |
527 | |
528 | Address MacroAssembler::as_Address(ArrayAddress adr) { |
529 | AddressLiteral base = adr.base(); |
530 | lea(rscratch1, base); |
531 | Address index = adr.index(); |
532 | assert(index._disp == 0, "must not have disp" ); // maybe it can? |
533 | Address array(rscratch1, index._index, index._scale, index._disp); |
534 | return array; |
535 | } |
536 | |
537 | void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { |
538 | Label L, E; |
539 | |
540 | #ifdef _WIN64 |
  // Windows always allocates space for its register args
542 | assert(num_args <= 4, "only register arguments supported" ); |
543 | subq(rsp, frame::arg_reg_save_area_bytes); |
544 | #endif |
545 | |
546 | // Align stack if necessary |
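  // The 64-bit C ABI requires rsp to be 16-byte aligned at the call site; rsp
  // is assumed to be at least 8-byte aligned here, so testing its low four
  // bits tells us whether 8 bytes of padding are needed around the call.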
547 | testl(rsp, 15); |
548 | jcc(Assembler::zero, L); |
549 | |
550 | subq(rsp, 8); |
551 | { |
552 | call(RuntimeAddress(entry_point)); |
553 | } |
554 | addq(rsp, 8); |
555 | jmp(E); |
556 | |
557 | bind(L); |
558 | { |
559 | call(RuntimeAddress(entry_point)); |
560 | } |
561 | |
562 | bind(E); |
563 | |
564 | #ifdef _WIN64 |
565 | // restore stack pointer |
566 | addq(rsp, frame::arg_reg_save_area_bytes); |
567 | #endif |
568 | |
569 | } |
570 | |
571 | void MacroAssembler::cmp64(Register src1, AddressLiteral src2) { |
572 | assert(!src2.is_lval(), "should use cmpptr" ); |
573 | |
574 | if (reachable(src2)) { |
575 | cmpq(src1, as_Address(src2)); |
576 | } else { |
577 | lea(rscratch1, src2); |
578 | Assembler::cmpq(src1, Address(rscratch1, 0)); |
579 | } |
580 | } |
581 | |
582 | int MacroAssembler::corrected_idivq(Register reg) { |
583 | // Full implementation of Java ldiv and lrem; checks for special |
584 | // case as described in JVM spec., p.243 & p.271. The function |
585 | // returns the (pc) offset of the idivl instruction - may be needed |
586 | // for implicit exceptions. |
587 | // |
588 | // normal case special case |
589 | // |
590 | // input : rax: dividend min_long |
591 | // reg: divisor (may not be eax/edx) -1 |
592 | // |
593 | // output: rax: quotient (= rax idiv reg) min_long |
594 | // rdx: remainder (= rax irem reg) 0 |
595 | assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register" ); |
596 | static const int64_t min_long = 0x8000000000000000; |
597 | Label normal_case, special_case; |
598 | |
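  // min_long / -1 is the one case whose true quotient (2^63) does not fit in
  // a signed 64-bit register; idivq would raise #DE for it, so it is filtered
  // out up front and the spec-mandated result (quotient = min_long,
  // remainder = 0) is produced without dividing.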
599 | // check for special case |
600 | cmp64(rax, ExternalAddress((address) &min_long)); |
601 | jcc(Assembler::notEqual, normal_case); |
602 | xorl(rdx, rdx); // prepare rdx for possible special case (where |
603 | // remainder = 0) |
604 | cmpq(reg, -1); |
605 | jcc(Assembler::equal, special_case); |
606 | |
607 | // handle normal case |
608 | bind(normal_case); |
609 | cdqq(); |
610 | int idivq_offset = offset(); |
611 | idivq(reg); |
612 | |
613 | // normal and special case exit |
614 | bind(special_case); |
615 | |
616 | return idivq_offset; |
617 | } |
618 | |
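// The value == min_jint checks below (and in the increment variants that
// follow) avoid negating INT_MIN, which would overflow; otherwise the
// cheapest available instruction is used (inc/dec when UseIncDec is on,
// add/sub with an immediate otherwise).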
619 | void MacroAssembler::decrementq(Register reg, int value) { |
620 | if (value == min_jint) { subq(reg, value); return; } |
621 | if (value < 0) { incrementq(reg, -value); return; } |
622 | if (value == 0) { ; return; } |
623 | if (value == 1 && UseIncDec) { decq(reg) ; return; } |
624 | /* else */ { subq(reg, value) ; return; } |
625 | } |
626 | |
627 | void MacroAssembler::decrementq(Address dst, int value) { |
628 | if (value == min_jint) { subq(dst, value); return; } |
629 | if (value < 0) { incrementq(dst, -value); return; } |
630 | if (value == 0) { ; return; } |
631 | if (value == 1 && UseIncDec) { decq(dst) ; return; } |
632 | /* else */ { subq(dst, value) ; return; } |
633 | } |
634 | |
635 | void MacroAssembler::incrementq(AddressLiteral dst) { |
636 | if (reachable(dst)) { |
637 | incrementq(as_Address(dst)); |
638 | } else { |
639 | lea(rscratch1, dst); |
640 | incrementq(Address(rscratch1, 0)); |
641 | } |
642 | } |
643 | |
644 | void MacroAssembler::incrementq(Register reg, int value) { |
645 | if (value == min_jint) { addq(reg, value); return; } |
646 | if (value < 0) { decrementq(reg, -value); return; } |
647 | if (value == 0) { ; return; } |
648 | if (value == 1 && UseIncDec) { incq(reg) ; return; } |
649 | /* else */ { addq(reg, value) ; return; } |
650 | } |
651 | |
652 | void MacroAssembler::incrementq(Address dst, int value) { |
653 | if (value == min_jint) { addq(dst, value); return; } |
654 | if (value < 0) { decrementq(dst, -value); return; } |
655 | if (value == 0) { ; return; } |
656 | if (value == 1 && UseIncDec) { incq(dst) ; return; } |
657 | /* else */ { addq(dst, value) ; return; } |
658 | } |
659 | |
660 | // 32bit can do a case table jump in one instruction but we no longer allow the base |
661 | // to be installed in the Address class |
662 | void MacroAssembler::jump(ArrayAddress entry) { |
663 | lea(rscratch1, entry.base()); |
664 | Address dispatch = entry.index(); |
665 | assert(dispatch._base == noreg, "must be" ); |
666 | dispatch._base = rscratch1; |
667 | jmp(dispatch); |
668 | } |
669 | |
670 | void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { |
671 | ShouldNotReachHere(); // 64bit doesn't use two regs |
672 | cmpq(x_lo, y_lo); |
673 | } |
674 | |
675 | void MacroAssembler::lea(Register dst, AddressLiteral src) { |
676 | mov_literal64(dst, (intptr_t)src.target(), src.rspec()); |
677 | } |
678 | |
679 | void MacroAssembler::lea(Address dst, AddressLiteral adr) { |
680 | mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec()); |
681 | movptr(dst, rscratch1); |
682 | } |
683 | |
684 | void MacroAssembler::leave() { |
685 | // %%% is this really better? Why not on 32bit too? |
686 | emit_int8((unsigned char)0xC9); // LEAVE |
687 | } |
688 | |
689 | void MacroAssembler::lneg(Register hi, Register lo) { |
690 | ShouldNotReachHere(); // 64bit doesn't use two regs |
691 | negq(lo); |
692 | } |
693 | |
694 | void MacroAssembler::movoop(Register dst, jobject obj) { |
695 | mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate()); |
696 | } |
697 | |
698 | void MacroAssembler::movoop(Address dst, jobject obj) { |
699 | mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate()); |
700 | movq(dst, rscratch1); |
701 | } |
702 | |
703 | void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { |
704 | mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); |
705 | } |
706 | |
707 | void MacroAssembler::mov_metadata(Address dst, Metadata* obj) { |
708 | mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); |
709 | movq(dst, rscratch1); |
710 | } |
711 | |
712 | void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { |
713 | if (src.is_lval()) { |
714 | mov_literal64(dst, (intptr_t)src.target(), src.rspec()); |
715 | } else { |
716 | if (reachable(src)) { |
717 | movq(dst, as_Address(src)); |
718 | } else { |
719 | lea(scratch, src); |
720 | movq(dst, Address(scratch, 0)); |
721 | } |
722 | } |
723 | } |
724 | |
725 | void MacroAssembler::movptr(ArrayAddress dst, Register src) { |
726 | movq(as_Address(dst), src); |
727 | } |
728 | |
729 | void MacroAssembler::movptr(Register dst, ArrayAddress src) { |
730 | movq(dst, as_Address(src)); |
731 | } |
732 | |
733 | // src should NEVER be a real pointer. Use AddressLiteral for true pointers |
734 | void MacroAssembler::movptr(Address dst, intptr_t src) { |
735 | mov64(rscratch1, src); |
736 | movq(dst, rscratch1); |
737 | } |
738 | |
739 | // These are mostly for initializing NULL |
740 | void MacroAssembler::movptr(Address dst, int32_t src) { |
741 | movslq(dst, src); |
742 | } |
743 | |
744 | void MacroAssembler::movptr(Register dst, int32_t src) { |
745 | mov64(dst, (intptr_t)src); |
746 | } |
747 | |
748 | void MacroAssembler::pushoop(jobject obj) { |
749 | movoop(rscratch1, obj); |
750 | push(rscratch1); |
751 | } |
752 | |
753 | void MacroAssembler::pushklass(Metadata* obj) { |
754 | mov_metadata(rscratch1, obj); |
755 | push(rscratch1); |
756 | } |
757 | |
758 | void MacroAssembler::pushptr(AddressLiteral src) { |
759 | lea(rscratch1, src); |
760 | if (src.is_lval()) { |
761 | push(rscratch1); |
762 | } else { |
763 | pushq(Address(rscratch1, 0)); |
764 | } |
765 | } |
766 | |
767 | void MacroAssembler::reset_last_Java_frame(bool clear_fp) { |
768 | // we must set sp to zero to clear frame |
769 | movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); |
770 | // must clear fp, so that compiled frames are not confused; it is |
771 | // possible that we need it only for debugging |
772 | if (clear_fp) { |
773 | movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); |
774 | } |
775 | |
776 | // Always clear the pc because it could have been set by make_walkable() |
777 | movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); |
778 | vzeroupper(); |
779 | } |
780 | |
781 | void MacroAssembler::set_last_Java_frame(Register last_java_sp, |
782 | Register last_java_fp, |
783 | address last_java_pc) { |
784 | vzeroupper(); |
785 | // determine last_java_sp register |
786 | if (!last_java_sp->is_valid()) { |
787 | last_java_sp = rsp; |
788 | } |
789 | |
790 | // last_java_fp is optional |
791 | if (last_java_fp->is_valid()) { |
792 | movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), |
793 | last_java_fp); |
794 | } |
795 | |
796 | // last_java_pc is optional |
797 | if (last_java_pc != NULL) { |
798 | Address java_pc(r15_thread, |
799 | JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()); |
800 | lea(rscratch1, InternalAddress(last_java_pc)); |
801 | movptr(java_pc, rscratch1); |
802 | } |
803 | |
804 | movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp); |
805 | } |
806 | |
807 | static void pass_arg0(MacroAssembler* masm, Register arg) { |
808 | if (c_rarg0 != arg ) { |
809 | masm->mov(c_rarg0, arg); |
810 | } |
811 | } |
812 | |
813 | static void pass_arg1(MacroAssembler* masm, Register arg) { |
814 | if (c_rarg1 != arg ) { |
815 | masm->mov(c_rarg1, arg); |
816 | } |
817 | } |
818 | |
819 | static void pass_arg2(MacroAssembler* masm, Register arg) { |
820 | if (c_rarg2 != arg ) { |
821 | masm->mov(c_rarg2, arg); |
822 | } |
823 | } |
824 | |
825 | static void pass_arg3(MacroAssembler* masm, Register arg) { |
826 | if (c_rarg3 != arg ) { |
827 | masm->mov(c_rarg3, arg); |
828 | } |
829 | } |
830 | |
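// stop() captures the current rip, saves all general-purpose registers with
// pusha() and passes msg, rip and a pointer to that register block to
// debug64(); once debug64() returns, execution halts at the hlt().
// print_state64() below indexes the same block, which is why its regs[]
// offsets follow the pusha() push order.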
831 | void MacroAssembler::stop(const char* msg) { |
832 | address rip = pc(); |
833 | pusha(); // get regs on stack |
834 | lea(c_rarg0, ExternalAddress((address) msg)); |
835 | lea(c_rarg1, InternalAddress(rip)); |
836 | movq(c_rarg2, rsp); // pass pointer to regs array |
837 | andq(rsp, -16); // align stack as required by ABI |
838 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); |
839 | hlt(); |
840 | } |
841 | |
842 | void MacroAssembler::warn(const char* msg) { |
843 | push(rbp); |
844 | movq(rbp, rsp); |
845 | andq(rsp, -16); // align stack as required by push_CPU_state and call |
846 | push_CPU_state(); // keeps alignment at 16 bytes |
847 | lea(c_rarg0, ExternalAddress((address) msg)); |
848 | lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning))); |
849 | call(rax); |
850 | pop_CPU_state(); |
851 | mov(rsp, rbp); |
852 | pop(rbp); |
853 | } |
854 | |
855 | void MacroAssembler::print_state() { |
856 | address rip = pc(); |
857 | pusha(); // get regs on stack |
858 | push(rbp); |
859 | movq(rbp, rsp); |
860 | andq(rsp, -16); // align stack as required by push_CPU_state and call |
861 | push_CPU_state(); // keeps alignment at 16 bytes |
862 | |
863 | lea(c_rarg0, InternalAddress(rip)); |
864 | lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array |
865 | call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1); |
866 | |
867 | pop_CPU_state(); |
868 | mov(rsp, rbp); |
869 | pop(rbp); |
870 | popa(); |
871 | } |
872 | |
873 | #ifndef PRODUCT |
874 | extern "C" void findpc(intptr_t x); |
875 | #endif |
876 | |
877 | void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) { |
  // In order to get locks to work, we need to fake an in_VM state
879 | if (ShowMessageBoxOnError) { |
880 | JavaThread* thread = JavaThread::current(); |
881 | JavaThreadState saved_state = thread->thread_state(); |
882 | thread->set_thread_state(_thread_in_vm); |
883 | #ifndef PRODUCT |
884 | if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { |
885 | ttyLocker ttyl; |
886 | BytecodeCounter::print(); |
887 | } |
888 | #endif |
889 | // To see where a verify_oop failed, get $ebx+40/X for this frame. |
890 | // XXX correct this offset for amd64 |
891 | // This is the value of eip which points to where verify_oop will return. |
892 | if (os::message_box(msg, "Execution stopped, print registers?" )) { |
893 | print_state64(pc, regs); |
894 | BREAKPOINT; |
895 | assert(false, "start up GDB" ); |
896 | } |
897 | ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); |
898 | } else { |
899 | ttyLocker ttyl; |
900 | ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n" , |
901 | msg); |
902 | assert(false, "DEBUG MESSAGE: %s" , msg); |
903 | } |
904 | } |
905 | |
906 | void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) { |
907 | ttyLocker ttyl; |
908 | FlagSetting fs(Debugging, true); |
909 | tty->print_cr("rip = 0x%016lx" , (intptr_t)pc); |
910 | #ifndef PRODUCT |
911 | tty->cr(); |
912 | findpc(pc); |
913 | tty->cr(); |
914 | #endif |
915 | #define PRINT_REG(rax, value) \ |
916 | { tty->print("%s = ", #rax); os::print_location(tty, value); } |
917 | PRINT_REG(rax, regs[15]); |
918 | PRINT_REG(rbx, regs[12]); |
919 | PRINT_REG(rcx, regs[14]); |
920 | PRINT_REG(rdx, regs[13]); |
921 | PRINT_REG(rdi, regs[8]); |
922 | PRINT_REG(rsi, regs[9]); |
923 | PRINT_REG(rbp, regs[10]); |
924 | PRINT_REG(rsp, regs[11]); |
925 | PRINT_REG(r8 , regs[7]); |
926 | PRINT_REG(r9 , regs[6]); |
927 | PRINT_REG(r10, regs[5]); |
928 | PRINT_REG(r11, regs[4]); |
929 | PRINT_REG(r12, regs[3]); |
930 | PRINT_REG(r13, regs[2]); |
931 | PRINT_REG(r14, regs[1]); |
932 | PRINT_REG(r15, regs[0]); |
933 | #undef PRINT_REG |
  // Print some words near top of stack.
935 | int64_t* rsp = (int64_t*) regs[11]; |
936 | int64_t* dump_sp = rsp; |
937 | for (int col1 = 0; col1 < 8; col1++) { |
938 | tty->print("(rsp+0x%03x) 0x%016lx: " , (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); |
939 | os::print_location(tty, *dump_sp++); |
940 | } |
941 | for (int row = 0; row < 25; row++) { |
942 | tty->print("(rsp+0x%03x) 0x%016lx: " , (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); |
943 | for (int col = 0; col < 4; col++) { |
944 | tty->print(" 0x%016lx" , (intptr_t)*dump_sp++); |
945 | } |
946 | tty->cr(); |
947 | } |
948 | // Print some instructions around pc: |
949 | Disassembler::decode((address)pc-64, (address)pc); |
950 | tty->print_cr("--------" ); |
951 | Disassembler::decode((address)pc, (address)pc+32); |
952 | } |
953 | |
954 | #endif // _LP64 |
955 | |
956 | // Now versions that are common to 32/64 bit |
957 | |
958 | void MacroAssembler::addptr(Register dst, int32_t imm32) { |
959 | LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32)); |
960 | } |
961 | |
962 | void MacroAssembler::addptr(Register dst, Register src) { |
963 | LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); |
964 | } |
965 | |
966 | void MacroAssembler::addptr(Address dst, Register src) { |
967 | LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); |
968 | } |
969 | |
970 | void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) { |
971 | if (reachable(src)) { |
972 | Assembler::addsd(dst, as_Address(src)); |
973 | } else { |
974 | lea(rscratch1, src); |
975 | Assembler::addsd(dst, Address(rscratch1, 0)); |
976 | } |
977 | } |
978 | |
979 | void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) { |
980 | if (reachable(src)) { |
981 | addss(dst, as_Address(src)); |
982 | } else { |
983 | lea(rscratch1, src); |
984 | addss(dst, Address(rscratch1, 0)); |
985 | } |
986 | } |
987 | |
988 | void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) { |
989 | if (reachable(src)) { |
990 | Assembler::addpd(dst, as_Address(src)); |
991 | } else { |
992 | lea(rscratch1, src); |
993 | Assembler::addpd(dst, Address(rscratch1, 0)); |
994 | } |
995 | } |
996 | |
997 | void MacroAssembler::align(int modulus) { |
998 | align(modulus, offset()); |
999 | } |
1000 | |
1001 | void MacroAssembler::align(int modulus, int target) { |
1002 | if (target % modulus != 0) { |
1003 | nop(modulus - (target % modulus)); |
1004 | } |
1005 | } |
1006 | |
1007 | void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { |
1008 | // Used in sign-masking with aligned address. |
1009 | assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes" ); |
1010 | if (reachable(src)) { |
1011 | Assembler::andpd(dst, as_Address(src)); |
1012 | } else { |
1013 | lea(scratch_reg, src); |
1014 | Assembler::andpd(dst, Address(scratch_reg, 0)); |
1015 | } |
1016 | } |
1017 | |
1018 | void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { |
1019 | // Used in sign-masking with aligned address. |
1020 | assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes" ); |
1021 | if (reachable(src)) { |
1022 | Assembler::andps(dst, as_Address(src)); |
1023 | } else { |
1024 | lea(scratch_reg, src); |
1025 | Assembler::andps(dst, Address(scratch_reg, 0)); |
1026 | } |
1027 | } |
1028 | |
1029 | void MacroAssembler::andptr(Register dst, int32_t imm32) { |
1030 | LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); |
1031 | } |
1032 | |
1033 | void MacroAssembler::atomic_incl(Address counter_addr) { |
1034 | lock(); |
1035 | incrementl(counter_addr); |
1036 | } |
1037 | |
1038 | void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) { |
1039 | if (reachable(counter_addr)) { |
1040 | atomic_incl(as_Address(counter_addr)); |
1041 | } else { |
1042 | lea(scr, counter_addr); |
1043 | atomic_incl(Address(scr, 0)); |
1044 | } |
1045 | } |
1046 | |
1047 | #ifdef _LP64 |
1048 | void MacroAssembler::atomic_incq(Address counter_addr) { |
1049 | lock(); |
1050 | incrementq(counter_addr); |
1051 | } |
1052 | |
1053 | void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) { |
1054 | if (reachable(counter_addr)) { |
1055 | atomic_incq(as_Address(counter_addr)); |
1056 | } else { |
1057 | lea(scr, counter_addr); |
1058 | atomic_incq(Address(scr, 0)); |
1059 | } |
1060 | } |
1061 | #endif |
1062 | |
1063 | // Writes to stack successive pages until offset reached to check for |
1064 | // stack overflow + shadow pages. This clobbers tmp. |
1065 | void MacroAssembler::bang_stack_size(Register size, Register tmp) { |
1066 | movptr(tmp, rsp); |
1067 | // Bang stack for total size given plus shadow page size. |
1068 | // Bang one page at a time because large size can bang beyond yellow and |
1069 | // red zones. |
1070 | Label loop; |
1071 | bind(loop); |
1072 | movl(Address(tmp, (-os::vm_page_size())), size ); |
1073 | subptr(tmp, os::vm_page_size()); |
1074 | subl(size, os::vm_page_size()); |
1075 | jcc(Assembler::greater, loop); |
1076 | |
1077 | // Bang down shadow pages too. |
1078 | // At this point, (tmp-0) is the last address touched, so don't |
1079 | // touch it again. (It was touched as (tmp-pagesize) but then tmp |
1080 | // was post-decremented.) Skip this address by starting at i=1, and |
1081 | // touch a few more pages below. N.B. It is important to touch all |
1082 | // the way down including all pages in the shadow zone. |
1083 | for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) { |
    // this could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
1086 | movptr(Address(tmp, (-i*os::vm_page_size())), size ); |
1087 | } |
1088 | } |
1089 | |
1090 | void MacroAssembler::reserved_stack_check() { |
1091 | // testing if reserved zone needs to be enabled |
1092 | Label no_reserved_zone_enabling; |
1093 | Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread); |
1094 | NOT_LP64(get_thread(rsi);) |
1095 | |
1096 | cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset())); |
1097 | jcc(Assembler::below, no_reserved_zone_enabling); |
1098 | |
1099 | call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread); |
1100 | jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); |
1101 | should_not_reach_here(); |
1102 | |
1103 | bind(no_reserved_zone_enabling); |
1104 | } |
1105 | |
1106 | int MacroAssembler::biased_locking_enter(Register lock_reg, |
1107 | Register obj_reg, |
1108 | Register swap_reg, |
1109 | Register tmp_reg, |
1110 | bool swap_reg_contains_mark, |
1111 | Label& done, |
1112 | Label* slow_case, |
1113 | BiasedLockingCounters* counters) { |
1114 | assert(UseBiasedLocking, "why call this otherwise?" ); |
1115 | assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq" ); |
1116 | assert(tmp_reg != noreg, "tmp_reg must be supplied" ); |
1117 | assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg); |
1118 | assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout" ); |
1119 | Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); |
1120 | NOT_LP64( Address saved_mark_addr(lock_reg, 0); ) |
1121 | |
1122 | if (PrintBiasedLockingStatistics && counters == NULL) { |
1123 | counters = BiasedLocking::counters(); |
1124 | } |
1125 | // Biased locking |
1126 | // See whether the lock is currently biased toward our thread and |
1127 | // whether the epoch is still valid |
1128 | // Note that the runtime guarantees sufficient alignment of JavaThread |
1129 | // pointers to allow age to be placed into low bits |
1130 | // First check to see whether biasing is even enabled for this object |
1131 | Label cas_label; |
1132 | int null_check_offset = -1; |
1133 | if (!swap_reg_contains_mark) { |
1134 | null_check_offset = offset(); |
1135 | movptr(swap_reg, mark_addr); |
1136 | } |
1137 | movptr(tmp_reg, swap_reg); |
1138 | andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place); |
1139 | cmpptr(tmp_reg, markOopDesc::biased_lock_pattern); |
1140 | jcc(Assembler::notEqual, cas_label); |
1141 | // The bias pattern is present in the object's header. Need to check |
1142 | // whether the bias owner and the epoch are both still current. |
1143 | #ifndef _LP64 |
1144 | // Note that because there is no current thread register on x86_32 we |
1145 | // need to store off the mark word we read out of the object to |
1146 | // avoid reloading it and needing to recheck invariants below. This |
1147 | // store is unfortunate but it makes the overall code shorter and |
1148 | // simpler. |
1149 | movptr(saved_mark_addr, swap_reg); |
1150 | #endif |
1151 | if (swap_reg_contains_mark) { |
1152 | null_check_offset = offset(); |
1153 | } |
1154 | load_prototype_header(tmp_reg, obj_reg); |
1155 | #ifdef _LP64 |
1156 | orptr(tmp_reg, r15_thread); |
1157 | xorptr(tmp_reg, swap_reg); |
  Register header_reg = tmp_reg;
1159 | #else |
1160 | xorptr(tmp_reg, swap_reg); |
1161 | get_thread(swap_reg); |
1162 | xorptr(swap_reg, tmp_reg); |
1163 | Register header_reg = swap_reg; |
1164 | #endif |
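  // header_reg now holds the mark word XORed with the desired
  // "biased to current thread, current epoch" header (prototype | thread);
  // once the age bits are masked off below, it is zero exactly when the lock
  // is already biased toward this thread and the epoch is still valid.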
1165 | andptr(header_reg, ~((int) markOopDesc::age_mask_in_place)); |
1166 | if (counters != NULL) { |
1167 | cond_inc32(Assembler::zero, |
1168 | ExternalAddress((address) counters->biased_lock_entry_count_addr())); |
1169 | } |
1170 | jcc(Assembler::equal, done); |
1171 | |
1172 | Label try_revoke_bias; |
1173 | Label try_rebias; |
1174 | |
1175 | // At this point we know that the header has the bias pattern and |
1176 | // that we are not the bias owner in the current epoch. We need to |
1177 | // figure out more details about the state of the header in order to |
1178 | // know what operations can be legally performed on the object's |
1179 | // header. |
1180 | |
1181 | // If the low three bits in the xor result aren't clear, that means |
1182 | // the prototype header is no longer biased and we have to revoke |
1183 | // the bias on this object. |
1184 | testptr(header_reg, markOopDesc::biased_lock_mask_in_place); |
1185 | jccb(Assembler::notZero, try_revoke_bias); |
1186 | |
1187 | // Biasing is still enabled for this data type. See whether the |
1188 | // epoch of the current bias is still valid, meaning that the epoch |
1189 | // bits of the mark word are equal to the epoch bits of the |
1190 | // prototype header. (Note that the prototype header's epoch bits |
1191 | // only change at a safepoint.) If not, attempt to rebias the object |
1192 | // toward the current thread. Note that we must be absolutely sure |
1193 | // that the current epoch is invalid in order to do this because |
1194 | // otherwise the manipulations it performs on the mark word are |
1195 | // illegal. |
1196 | testptr(header_reg, markOopDesc::epoch_mask_in_place); |
1197 | jccb(Assembler::notZero, try_rebias); |
1198 | |
1199 | // The epoch of the current bias is still valid but we know nothing |
1200 | // about the owner; it might be set or it might be clear. Try to |
1201 | // acquire the bias of the object using an atomic operation. If this |
1202 | // fails we will go in to the runtime to revoke the object's bias. |
1203 | // Note that we first construct the presumed unbiased header so we |
1204 | // don't accidentally blow away another thread's valid bias. |
1205 | NOT_LP64( movptr(swap_reg, saved_mark_addr); ) |
1206 | andptr(swap_reg, |
1207 | markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); |
1208 | #ifdef _LP64 |
1209 | movptr(tmp_reg, swap_reg); |
1210 | orptr(tmp_reg, r15_thread); |
1211 | #else |
1212 | get_thread(tmp_reg); |
1213 | orptr(tmp_reg, swap_reg); |
1214 | #endif |
1215 | lock(); |
1216 | cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg |
1217 | // If the biasing toward our thread failed, this means that |
1218 | // another thread succeeded in biasing it toward itself and we |
1219 | // need to revoke that bias. The revocation will occur in the |
1220 | // interpreter runtime in the slow case. |
1221 | if (counters != NULL) { |
1222 | cond_inc32(Assembler::zero, |
1223 | ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr())); |
1224 | } |
1225 | if (slow_case != NULL) { |
1226 | jcc(Assembler::notZero, *slow_case); |
1227 | } |
1228 | jmp(done); |
1229 | |
1230 | bind(try_rebias); |
1231 | // At this point we know the epoch has expired, meaning that the |
1232 | // current "bias owner", if any, is actually invalid. Under these |
1233 | // circumstances _only_, we are allowed to use the current header's |
1234 | // value as the comparison value when doing the cas to acquire the |
1235 | // bias in the current epoch. In other words, we allow transfer of |
1236 | // the bias from one thread to another directly in this situation. |
1237 | // |
1238 | // FIXME: due to a lack of registers we currently blow away the age |
1239 | // bits in this situation. Should attempt to preserve them. |
1240 | load_prototype_header(tmp_reg, obj_reg); |
1241 | #ifdef _LP64 |
1242 | orptr(tmp_reg, r15_thread); |
1243 | #else |
1244 | get_thread(swap_reg); |
1245 | orptr(tmp_reg, swap_reg); |
1246 | movptr(swap_reg, saved_mark_addr); |
1247 | #endif |
1248 | lock(); |
1249 | cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg |
1250 | // If the biasing toward our thread failed, then another thread |
1251 | // succeeded in biasing it toward itself and we need to revoke that |
1252 | // bias. The revocation will occur in the runtime in the slow case. |
1253 | if (counters != NULL) { |
1254 | cond_inc32(Assembler::zero, |
1255 | ExternalAddress((address) counters->rebiased_lock_entry_count_addr())); |
1256 | } |
1257 | if (slow_case != NULL) { |
1258 | jcc(Assembler::notZero, *slow_case); |
1259 | } |
1260 | jmp(done); |
1261 | |
1262 | bind(try_revoke_bias); |
1263 | // The prototype mark in the klass doesn't have the bias bit set any |
1264 | // more, indicating that objects of this data type are not supposed |
1265 | // to be biased any more. We are going to try to reset the mark of |
1266 | // this object to the prototype value and fall through to the |
1267 | // CAS-based locking scheme. Note that if our CAS fails, it means |
1268 | // that another thread raced us for the privilege of revoking the |
1269 | // bias of this particular object, so it's okay to continue in the |
1270 | // normal locking code. |
1271 | // |
1272 | // FIXME: due to a lack of registers we currently blow away the age |
1273 | // bits in this situation. Should attempt to preserve them. |
1274 | NOT_LP64( movptr(swap_reg, saved_mark_addr); ) |
1275 | load_prototype_header(tmp_reg, obj_reg); |
1276 | lock(); |
1277 | cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg |
1278 | // Fall through to the normal CAS-based lock, because no matter what |
1279 | // the result of the above CAS, some thread must have succeeded in |
1280 | // removing the bias bit from the object's header. |
1281 | if (counters != NULL) { |
1282 | cond_inc32(Assembler::zero, |
1283 | ExternalAddress((address) counters->revoked_lock_entry_count_addr())); |
1284 | } |
1285 | |
1286 | bind(cas_label); |
1287 | |
1288 | return null_check_offset; |
1289 | } |
1290 | |
1291 | void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { |
1292 | assert(UseBiasedLocking, "why call this otherwise?" ); |
1293 | |
1294 | // Check for biased locking unlock case, which is a no-op |
1295 | // Note: we do not have to check the thread ID for two reasons. |
1296 | // First, the interpreter checks for IllegalMonitorStateException at |
1297 | // a higher level. Second, if the bias was revoked while we held the |
1298 | // lock, the object could not be rebiased toward another thread, so |
1299 | // the bias bit would be clear. |
1300 | movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); |
1301 | andptr(temp_reg, markOopDesc::biased_lock_mask_in_place); |
1302 | cmpptr(temp_reg, markOopDesc::biased_lock_pattern); |
1303 | jcc(Assembler::equal, done); |
1304 | } |
1305 | |
1306 | #ifdef COMPILER2 |
1307 | |
1308 | #if INCLUDE_RTM_OPT |
1309 | |
1310 | // Update rtm_counters based on abort status |
1311 | // input: abort_status |
1312 | // rtm_counters (RTMLockingCounters*) |
1313 | // flags are killed |
1314 | void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { |
1315 | |
1316 | atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); |
1317 | if (PrintPreciseRTMLockingStatistics) { |
1318 | for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { |
1319 | Label check_abort; |
1320 | testl(abort_status, (1<<i)); |
1321 | jccb(Assembler::equal, check_abort); |
1322 | atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); |
1323 | bind(check_abort); |
1324 | } |
1325 | } |
1326 | } |
1327 | |
1328 | // Branch if (random & (count-1) != 0), count is 2^n |
1329 | // tmp, scr and flags are killed |
1330 | void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { |
1331 | assert(tmp == rax, "" ); |
1332 | assert(scr == rdx, "" ); |
1333 | rdtsc(); // modifies EDX:EAX |
1334 | andptr(tmp, count-1); |
1335 | jccb(Assembler::notZero, brLabel); |
1336 | } |
1337 | |
1338 | // Perform abort ratio calculation, set no_rtm bit if high ratio |
1339 | // input: rtm_counters_Reg (RTMLockingCounters* address) |
1340 | // tmpReg, rtm_counters_Reg and flags are killed |
1341 | void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, |
1342 | Register rtm_counters_Reg, |
1343 | RTMLockingCounters* rtm_counters, |
1344 | Metadata* method_data) { |
1345 | Label L_done, L_check_always_rtm1, L_check_always_rtm2; |
1346 | |
1347 | if (RTMLockingCalculationDelay > 0) { |
1348 | // Delay calculation |
1349 | movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg); |
1350 | testptr(tmpReg, tmpReg); |
1351 | jccb(Assembler::equal, L_done); |
1352 | } |
1353 | // Abort ratio calculation only if abort_count > RTMAbortThreshold |
1354 | // Aborted transactions = abort_count * 100 |
1355 | // All transactions = total_count * RTMTotalCountIncrRate |
1356 | // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) |
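  // Illustrative example (flag values not asserted here): with
  // RTMTotalCountIncrRate == 1 and RTMAbortRatio == 50, the no_rtm bit is set
  // once abort_count * 100 >= total_count * 50, i.e. once at least half of
  // the sampled transactions have aborted.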
1357 | |
1358 | movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); |
1359 | cmpptr(tmpReg, RTMAbortThreshold); |
1360 | jccb(Assembler::below, L_check_always_rtm2); |
1361 | imulptr(tmpReg, tmpReg, 100); |
1362 | |
1363 | Register scrReg = rtm_counters_Reg; |
1364 | movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); |
1365 | imulptr(scrReg, scrReg, RTMTotalCountIncrRate); |
1366 | imulptr(scrReg, scrReg, RTMAbortRatio); |
1367 | cmpptr(tmpReg, scrReg); |
1368 | jccb(Assembler::below, L_check_always_rtm1); |
1369 | if (method_data != NULL) { |
1370 | // set rtm_state to "no rtm" in MDO |
1371 | mov_metadata(tmpReg, method_data); |
1372 | lock(); |
1373 | orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); |
1374 | } |
1375 | jmpb(L_done); |
1376 | bind(L_check_always_rtm1); |
1377 | // Reload RTMLockingCounters* address |
1378 | lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); |
1379 | bind(L_check_always_rtm2); |
1380 | movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); |
1381 | cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); |
1382 | jccb(Assembler::below, L_done); |
1383 | if (method_data != NULL) { |
1384 | // set rtm_state to "always rtm" in MDO |
1385 | mov_metadata(tmpReg, method_data); |
1386 | lock(); |
1387 | orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); |
1388 | } |
1389 | bind(L_done); |
1390 | } |
1391 | |
1392 | // Update counters and perform abort ratio calculation |
1393 | // input: abort_status_Reg |
1394 | // rtm_counters_Reg, flags are killed |
1395 | void MacroAssembler::rtm_profiling(Register abort_status_Reg, |
1396 | Register rtm_counters_Reg, |
1397 | RTMLockingCounters* rtm_counters, |
1398 | Metadata* method_data, |
1399 | bool profile_rtm) { |
1400 | |
1401 | assert(rtm_counters != NULL, "should not be NULL when profiling RTM" ); |
1402 | // update rtm counters based on rax value at abort |
1403 | // reads abort_status_Reg, updates flags |
1404 | lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); |
1405 | rtm_counters_update(abort_status_Reg, rtm_counters_Reg); |
1406 | if (profile_rtm) { |
1407 | // Save abort status because abort_status_Reg is used by following code. |
1408 | if (RTMRetryCount > 0) { |
1409 | push(abort_status_Reg); |
1410 | } |
1411 | assert(rtm_counters != NULL, "should not be NULL when profiling RTM" ); |
1412 | rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); |
1413 | // restore abort status |
1414 | if (RTMRetryCount > 0) { |
1415 | pop(abort_status_Reg); |
1416 | } |
1417 | } |
1418 | } |
1419 | |
1420 | // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) |
1421 | // inputs: retry_count_Reg |
1422 | // : abort_status_Reg |
1423 | // output: retry_count_Reg decremented by 1 |
1424 | // flags are killed |
1425 | void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { |
1426 | Label doneRetry; |
1427 | assert(abort_status_Reg == rax, "" ); |
1428 | // The abort reason bits are in eax (see all states in rtmLocking.hpp) |
1429 | // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) |
1430 | // if reason is in 0x6 and retry count != 0 then retry |
1431 | andptr(abort_status_Reg, 0x6); |
1432 | jccb(Assembler::zero, doneRetry); |
1433 | testl(retry_count_Reg, retry_count_Reg); |
1434 | jccb(Assembler::zero, doneRetry); |
1435 | pause(); |
1436 | decrementl(retry_count_Reg); |
1437 | jmp(retryLabel); |
1438 | bind(doneRetry); |
1439 | } |
1440 | |
1441 | // Spin and retry if lock is busy, |
1442 | // inputs: box_Reg (monitor address) |
1443 | // : retry_count_Reg |
1444 | // output: retry_count_Reg decremented by 1 |
1445 | // : clear z flag if retry count exceeded |
1446 | // tmp_Reg, scr_Reg, flags are killed |
1447 | void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, |
1448 | Register tmp_Reg, Register scr_Reg, Label& retryLabel) { |
1449 | Label SpinLoop, SpinExit, doneRetry; |
1450 | int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); |
1451 | |
1452 | testl(retry_count_Reg, retry_count_Reg); |
1453 | jccb(Assembler::zero, doneRetry); |
1454 | decrementl(retry_count_Reg); |
1455 | movptr(scr_Reg, RTMSpinLoopCount); |
1456 | |
1457 | bind(SpinLoop); |
1458 | pause(); |
1459 | decrementl(scr_Reg); |
1460 | jccb(Assembler::lessEqual, SpinExit); |
1461 | movptr(tmp_Reg, Address(box_Reg, owner_offset)); |
1462 | testptr(tmp_Reg, tmp_Reg); |
1463 | jccb(Assembler::notZero, SpinLoop); |
1464 | |
1465 | bind(SpinExit); |
1466 | jmp(retryLabel); |
1467 | bind(doneRetry); |
1468 | incrementl(retry_count_Reg); // clear z flag |
1469 | } |
1470 | |
1471 | // Use RTM for normal stack locks |
1472 | // Input: objReg (object to lock) |
1473 | void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, |
1474 | Register retry_on_abort_count_Reg, |
1475 | RTMLockingCounters* stack_rtm_counters, |
1476 | Metadata* method_data, bool profile_rtm, |
1477 | Label& DONE_LABEL, Label& IsInflated) { |
1478 | assert(UseRTMForStackLocks, "why call this otherwise?" ); |
1479 | assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking" ); |
1480 | assert(tmpReg == rax, "" ); |
1481 | assert(scrReg == rdx, "" ); |
1482 | Label L_rtm_retry, L_decrement_retry, L_on_abort; |
1483 | |
1484 | if (RTMRetryCount > 0) { |
1485 | movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort |
1486 | bind(L_rtm_retry); |
1487 | } |
1488 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); |
1489 | testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased |
1490 | jcc(Assembler::notZero, IsInflated); |
1491 | |
1492 | if (PrintPreciseRTMLockingStatistics || profile_rtm) { |
1493 | Label L_noincrement; |
1494 | if (RTMTotalCountIncrRate > 1) { |
1495 | // tmpReg, scrReg and flags are killed |
1496 | branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); |
1497 | } |
1498 | assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM" ); |
1499 | atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); |
1500 | bind(L_noincrement); |
1501 | } |
1502 | xbegin(L_on_abort); |
1503 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword |
1504 | andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits |
1505 | cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked |
1506 | jcc(Assembler::equal, DONE_LABEL); // all done if unlocked |
1507 | |
1508 | Register abort_status_Reg = tmpReg; // status of abort is stored in RAX |
1509 | if (UseRTMXendForLockBusy) { |
1510 | xend(); |
1511 | movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) |
1512 | jmp(L_decrement_retry); |
1513 | } |
1514 | else { |
1515 | xabort(0); |
1516 | } |
1517 | bind(L_on_abort); |
1518 | if (PrintPreciseRTMLockingStatistics || profile_rtm) { |
1519 | rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); |
1520 | } |
1521 | bind(L_decrement_retry); |
1522 | if (RTMRetryCount > 0) { |
1523 | // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) |
1524 | rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); |
1525 | } |
1526 | } |
1527 | |
// Use RTM for inflated locks
1529 | // inputs: objReg (object to lock) |
1530 | // boxReg (on-stack box address (displaced header location) - KILLED) |
1531 | // tmpReg (ObjectMonitor address + markOopDesc::monitor_value) |
1532 | void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, |
1533 | Register scrReg, Register retry_on_busy_count_Reg, |
1534 | Register retry_on_abort_count_Reg, |
1535 | RTMLockingCounters* rtm_counters, |
1536 | Metadata* method_data, bool profile_rtm, |
1537 | Label& DONE_LABEL) { |
1538 | assert(UseRTMLocking, "why call this otherwise?" ); |
1539 | assert(tmpReg == rax, "" ); |
1540 | assert(scrReg == rdx, "" ); |
1541 | Label L_rtm_retry, L_decrement_retry, L_on_abort; |
1542 | int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); |
1543 | |
1544 | // Without cast to int32_t a movptr will destroy r10 which is typically obj |
1545 | movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); |
1546 | movptr(boxReg, tmpReg); // Save ObjectMonitor address |
1547 | |
1548 | if (RTMRetryCount > 0) { |
1549 | movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy |
1550 | movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort |
1551 | bind(L_rtm_retry); |
1552 | } |
1553 | if (PrintPreciseRTMLockingStatistics || profile_rtm) { |
1554 | Label L_noincrement; |
1555 | if (RTMTotalCountIncrRate > 1) { |
1556 | // tmpReg, scrReg and flags are killed |
1557 | branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); |
1558 | } |
1559 | assert(rtm_counters != NULL, "should not be NULL when profiling RTM" ); |
1560 | atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); |
1561 | bind(L_noincrement); |
1562 | } |
1563 | xbegin(L_on_abort); |
1564 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); |
1565 | movptr(tmpReg, Address(tmpReg, owner_offset)); |
1566 | testptr(tmpReg, tmpReg); |
1567 | jcc(Assembler::zero, DONE_LABEL); |
1568 | if (UseRTMXendForLockBusy) { |
1569 | xend(); |
1570 | jmp(L_decrement_retry); |
1571 | } |
1572 | else { |
1573 | xabort(0); |
1574 | } |
1575 | bind(L_on_abort); |
1576 | Register abort_status_Reg = tmpReg; // status of abort is stored in RAX |
1577 | if (PrintPreciseRTMLockingStatistics || profile_rtm) { |
1578 | rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); |
1579 | } |
1580 | if (RTMRetryCount > 0) { |
1581 | // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) |
1582 | rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); |
1583 | } |
1584 | |
1585 | movptr(tmpReg, Address(boxReg, owner_offset)) ; |
1586 | testptr(tmpReg, tmpReg) ; |
1587 | jccb(Assembler::notZero, L_decrement_retry) ; |
1588 | |
1589 | // Appears unlocked - try to swing _owner from null to non-null. |
1590 | // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. |
1591 | #ifdef _LP64 |
1592 | Register threadReg = r15_thread; |
1593 | #else |
1594 | get_thread(scrReg); |
1595 | Register threadReg = scrReg; |
1596 | #endif |
1597 | lock(); |
1598 | cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg |
1599 | |
1600 | if (RTMRetryCount > 0) { |
    // on success we are done, otherwise retry
1602 | jccb(Assembler::equal, DONE_LABEL) ; |
1603 | bind(L_decrement_retry); |
1604 | // Spin and retry if lock is busy. |
1605 | rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); |
1606 | } |
1607 | else { |
1608 | bind(L_decrement_retry); |
1609 | } |
1610 | } |
1611 | |
1612 | #endif // INCLUDE_RTM_OPT |
1613 | |
1614 | // Fast_Lock and Fast_Unlock used by C2 |
1615 | |
1616 | // Because the transitions from emitted code to the runtime |
1617 | // monitorenter/exit helper stubs are so slow it's critical that |
1618 | // we inline both the stack-locking fast-path and the inflated fast path. |
1619 | // |
1620 | // See also: cmpFastLock and cmpFastUnlock. |
1621 | // |
1622 | // What follows is a specialized inline transliteration of the code |
1623 | // in slow_enter() and slow_exit(). If we're concerned about I$ bloat |
1624 | // another option would be to emit TrySlowEnter and TrySlowExit methods |
1625 | // at startup-time. These methods would accept arguments as |
1626 | // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure |
1627 | // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply |
1628 | // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. |
1629 | // In practice, however, the # of lock sites is bounded and is usually small. |
1630 | // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer |
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
1633 | // sites. |
1634 | // |
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1636 | // in java - using j.u.c and unsafe - and just bind the lock and unlock sites |
1637 | // to those specialized methods. That'd give us a mostly platform-independent |
1638 | // implementation that the JITs could optimize and inline at their pleasure. |
// Done correctly, the only time we'd need to cross to native code would be
1640 | // to park() or unpark() threads. We'd also need a few more unsafe operators |
1641 | // to (a) prevent compiler-JIT reordering of non-volatile accesses, and |
// (b) issue explicit barriers or fence operations.
1643 | // |
1644 | // TODO: |
1645 | // |
1646 | // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr). |
1647 | // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals. |
1648 | // Given TLAB allocation, Self is usually manifested in a register, so passing it into |
1649 | // the lock operators would typically be faster than reifying Self. |
1650 | // |
1651 | // * Ideally I'd define the primitives as: |
1652 | // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. |
1653 | // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED |
1654 | // Unfortunately ADLC bugs prevent us from expressing the ideal form. |
// Instead, we're stuck with the rather awkward and brittle register assignments below.
1656 | // Furthermore the register assignments are overconstrained, possibly resulting in |
1657 | // sub-optimal code near the synchronization site. |
1658 | // |
1659 | // * Eliminate the sp-proximity tests and just use "== Self" tests instead. |
1660 | // Alternately, use a better sp-proximity test. |
1661 | // |
1662 | // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. |
1663 | // Either one is sufficient to uniquely identify a thread. |
1664 | // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. |
1665 | // |
1666 | // * Intrinsify notify() and notifyAll() for the common cases where the |
1667 | // object is locked by the calling thread but the waitlist is empty. |
1668 | // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). |
1669 | // |
1670 | // * use jccb and jmpb instead of jcc and jmp to improve code density. |
1671 | // But beware of excessive branch density on AMD Opterons. |
1672 | // |
1673 | // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success |
1674 | // or failure of the fast-path. If the fast-path fails then we pass |
1675 | // control to the slow-path, typically in C. In Fast_Lock and |
1676 | // Fast_Unlock we often branch to DONE_LABEL, just to find that C2 |
1677 | // will emit a conditional branch immediately after the node. |
1678 | // So we have branches to branches and lots of ICC.ZF games. |
1679 | // Instead, it might be better to have C2 pass a "FailureLabel" |
1680 | // into Fast_Lock and Fast_Unlock. In the case of success, control |
1681 | // will drop through the node. ICC.ZF is undefined at exit. |
1682 | // In the case of failure, the node will branch directly to the |
// FailureLabel.
1684 | |
1685 | |
1686 | // obj: object to lock |
1687 | // box: on-stack box address (displaced header location) - KILLED |
1688 | // rax,: tmp -- KILLED |
1689 | // scr: tmp -- KILLED |
void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1691 | Register scrReg, Register cx1Reg, Register cx2Reg, |
1692 | BiasedLockingCounters* counters, |
1693 | RTMLockingCounters* rtm_counters, |
1694 | RTMLockingCounters* stack_rtm_counters, |
1695 | Metadata* method_data, |
1696 | bool use_rtm, bool profile_rtm) { |
1697 | // Ensure the register assignments are disjoint |
1698 | assert(tmpReg == rax, "" ); |
1699 | |
1700 | if (use_rtm) { |
1701 | assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); |
1702 | } else { |
1703 | assert(cx1Reg == noreg, "" ); |
1704 | assert(cx2Reg == noreg, "" ); |
1705 | assert_different_registers(objReg, boxReg, tmpReg, scrReg); |
1706 | } |
1707 | |
1708 | if (counters != NULL) { |
1709 | atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg); |
1710 | } |
1711 | |
1712 | // Possible cases that we'll encounter in fast_lock |
1713 | // ------------------------------------------------ |
1714 | // * Inflated |
1715 | // -- unlocked |
1716 | // -- Locked |
1717 | // = by self |
1718 | // = by other |
1719 | // * biased |
1720 | // -- by Self |
1721 | // -- by other |
1722 | // * neutral |
1723 | // * stack-locked |
1724 | // -- by self |
1725 | // = sp-proximity test hits |
1726 | // = sp-proximity test generates false-negative |
1727 | // -- by other |
1728 | // |
1729 | |
1730 | Label IsInflated, DONE_LABEL; |
1731 | |
1732 | // it's stack-locked, biased or neutral |
1733 | // TODO: optimize away redundant LDs of obj->mark and improve the markword triage |
1734 | // order to reduce the number of conditional branches in the most common cases. |
1735 | // Beware -- there's a subtle invariant that fetch of the markword |
1736 | // at [FETCH], below, will never observe a biased encoding (*101b). |
1737 | // If this invariant is not held we risk exclusion (safety) failure. |
1738 | if (UseBiasedLocking && !UseOptoBiasInlining) { |
1739 | biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters); |
1740 | } |
1741 | |
1742 | #if INCLUDE_RTM_OPT |
1743 | if (UseRTMForStackLocks && use_rtm) { |
1744 | rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, |
1745 | stack_rtm_counters, method_data, profile_rtm, |
1746 | DONE_LABEL, IsInflated); |
1747 | } |
1748 | #endif // INCLUDE_RTM_OPT |
1749 | |
1750 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] |
1751 | testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased |
1752 | jccb(Assembler::notZero, IsInflated); |
1753 | |
1754 | // Attempt stack-locking ... |
1755 | orptr (tmpReg, markOopDesc::unlocked_value); |
1756 | movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS |
1757 | lock(); |
1758 | cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg |
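  // If the mark was still 'unlocked', the CAS installs boxReg (the address of
  // the on-stack BasicLock) as the new mark, making the object stack-locked;
  // ZF reflects whether the CAS succeeded.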
1759 | if (counters != NULL) { |
1760 | cond_inc32(Assembler::equal, |
1761 | ExternalAddress((address)counters->fast_path_entry_count_addr())); |
1762 | } |
1763 | jcc(Assembler::equal, DONE_LABEL); // Success |
1764 | |
1765 | // Recursive locking. |
1766 | // The object is stack-locked: markword contains stack pointer to BasicLock. |
1767 | // Locked by current thread if difference with current SP is less than one page. |
1768 | subptr(tmpReg, rsp); |
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1770 | andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); |
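  // With the default 4K page the 64-bit mask above works out to 0xFFFFF007: it
  // keeps the low three (alignment/lock) bits and every bit at or above the
  // page size, so the AND yields zero only when the stack-lock address is
  // aligned and lies less than one page above rsp.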
1771 | movptr(Address(boxReg, 0), tmpReg); |
1772 | if (counters != NULL) { |
1773 | cond_inc32(Assembler::equal, |
1774 | ExternalAddress((address)counters->fast_path_entry_count_addr())); |
1775 | } |
1776 | jmp(DONE_LABEL); |
1777 | |
1778 | bind(IsInflated); |
1779 | // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value |
1780 | |
1781 | #if INCLUDE_RTM_OPT |
1782 | // Use the same RTM locking code in 32- and 64-bit VM. |
1783 | if (use_rtm) { |
1784 | rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg, |
1785 | rtm_counters, method_data, profile_rtm, DONE_LABEL); |
1786 | } else { |
1787 | #endif // INCLUDE_RTM_OPT |
1788 | |
1789 | #ifndef _LP64 |
1790 | // The object is inflated. |
1791 | |
1792 | // boxReg refers to the on-stack BasicLock in the current frame. |
1793 | // We'd like to write: |
1794 | // set box->_displaced_header = markOopDesc::unused_mark(). Any non-0 value suffices. |
    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1796 | // additional latency as we have another ST in the store buffer that must drain. |
1797 | |
1798 | // avoid ST-before-CAS |
1799 | // register juggle because we need tmpReg for cmpxchgptr below |
1800 | movptr(scrReg, boxReg); |
1801 | movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] |
1802 | |
1803 | // Optimistic form: consider XORL tmpReg,tmpReg |
1804 | movptr(tmpReg, NULL_WORD); |
1805 | |
1806 | // Appears unlocked - try to swing _owner from null to non-null. |
1807 | // Ideally, I'd manifest "Self" with get_thread and then attempt |
1808 | // to CAS the register containing Self into m->Owner. |
1809 | // But we don't have enough registers, so instead we can either try to CAS |
1810 | // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds |
1811 | // we later store "Self" into m->Owner. Transiently storing a stack address |
1812 | // (rsp or the address of the box) into m->owner is harmless. |
1813 | // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. |
1814 | lock(); |
1815 | cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); |
1816 | movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 |
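    // Any non-zero value works here: fast_unlock only compares the displaced
    // header against zero to detect a recursive stack-lock.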
1817 | // If we weren't able to swing _owner from NULL to the BasicLock |
1818 | // then take the slow path. |
1819 | jccb (Assembler::notZero, DONE_LABEL); |
1820 | // update _owner from BasicLock to thread |
1821 | get_thread (scrReg); // beware: clobbers ICCs |
1822 | movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); |
1823 | xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success |
1824 | |
1825 | // If the CAS fails we can either retry or pass control to the slow-path. |
1826 | // We use the latter tactic. |
1827 | // Pass the CAS result in the icc.ZFlag into DONE_LABEL |
1828 | // If the CAS was successful ... |
1829 | // Self has acquired the lock |
1830 | // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. |
1831 | // Intentional fall-through into DONE_LABEL ... |
1832 | #else // _LP64 |
1833 | // It's inflated |
1834 | movq(scrReg, tmpReg); |
1835 | xorq(tmpReg, tmpReg); |
1836 | |
1837 | lock(); |
1838 | cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); |
1839 | // Unconditionally set box->_displaced_header = markOopDesc::unused_mark(). |
1840 | // Without cast to int32_t movptr will destroy r10 which is typically obj. |
1841 | movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); |
1842 | // Intentional fall-through into DONE_LABEL ... |
1843 | // Propagate ICC.ZF from CAS above into DONE_LABEL. |
1844 | #endif // _LP64 |
1845 | #if INCLUDE_RTM_OPT |
1846 | } // use_rtm() |
1847 | #endif |
1848 | // DONE_LABEL is a hot target - we'd really like to place it at the |
1849 | // start of cache line by padding with NOPs. |
1850 | // See the AMD and Intel software optimization manuals for the |
1851 | // most efficient "long" NOP encodings. |
1852 | // Unfortunately none of our alignment mechanisms suffice. |
1853 | bind(DONE_LABEL); |
1854 | |
1855 | // At DONE_LABEL the icc ZFlag is set as follows ... |
1856 | // Fast_Unlock uses the same protocol. |
1857 | // ZFlag == 1 -> Success |
1858 | // ZFlag == 0 -> Failure - force control through the slow-path |
1859 | } |
1860 | |
1861 | // obj: object to unlock |
1862 | // box: box address (displaced header location), killed. Must be EAX. |
1863 | // tmp: killed, cannot be obj nor box. |
1864 | // |
1865 | // Some commentary on balanced locking: |
1866 | // |
1867 | // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. |
1868 | // Methods that don't have provably balanced locking are forced to run in the |
1869 | // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. |
1870 | // The interpreter provides two properties: |
1871 | // I1: At return-time the interpreter automatically and quietly unlocks any |
1872 | // objects acquired the current activation (frame). Recall that the |
1873 | // interpreter maintains an on-stack list of locks currently held by |
1874 | // a frame. |
1875 | // I2: If a method attempts to unlock an object that is not held by the |
1876 | // the frame the interpreter throws IMSX. |
1877 | // |
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1879 | // B() doesn't have provably balanced locking so it runs in the interpreter. |
1880 | // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O |
1881 | // is still locked by A(). |
1882 | // |
1883 | // The only other source of unbalanced locking would be JNI. The "Java Native Interface: |
1884 | // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter |
1885 | // should not be unlocked by "normal" java-level locking and vice-versa. The specification |
1886 | // doesn't specify what will occur if a program engages in such mixed-mode locking, however. |
1887 | // Arguably given that the spec legislates the JNI case as undefined our implementation |
1888 | // could reasonably *avoid* checking owner in Fast_Unlock(). |
1889 | // In the interest of performance we elide m->Owner==Self check in unlock. |
1890 | // A perfectly viable alternative is to elide the owner check except when |
1891 | // Xcheck:jni is enabled. |
1892 | |
1893 | void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { |
1894 | assert(boxReg == rax, "" ); |
1895 | assert_different_registers(objReg, boxReg, tmpReg); |
1896 | |
1897 | Label DONE_LABEL, Stacked, CheckSucc; |
1898 | |
1899 | // Critically, the biased locking test must have precedence over |
1900 | // and appear before the (box->dhw == 0) recursive stack-lock test. |
1901 | if (UseBiasedLocking && !UseOptoBiasInlining) { |
1902 | biased_locking_exit(objReg, tmpReg, DONE_LABEL); |
1903 | } |
1904 | |
1905 | #if INCLUDE_RTM_OPT |
1906 | if (UseRTMForStackLocks && use_rtm) { |
1907 | assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking" ); |
1908 | Label L_regular_unlock; |
1909 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword |
1910 | andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits |
1911 | cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked |
1912 | jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock |
1913 | xend(); // otherwise end... |
1914 | jmp(DONE_LABEL); // ... and we're done |
1915 | bind(L_regular_unlock); |
1916 | } |
1917 | #endif |
1918 | |
1919 | cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header |
1920 | jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock |
1921 | movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword |
1922 | testptr(tmpReg, markOopDesc::monitor_value); // Inflated? |
1923 | jccb (Assembler::zero, Stacked); |
1924 | |
1925 | // It's inflated. |
1926 | #if INCLUDE_RTM_OPT |
1927 | if (use_rtm) { |
1928 | Label L_regular_inflated_unlock; |
1929 | int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); |
1930 | movptr(boxReg, Address(tmpReg, owner_offset)); |
1931 | testptr(boxReg, boxReg); |
1932 | jccb(Assembler::notZero, L_regular_inflated_unlock); |
1933 | xend(); |
1934 | jmpb(DONE_LABEL); |
1935 | bind(L_regular_inflated_unlock); |
1936 | } |
1937 | #endif |
1938 | |
1939 | // Despite our balanced locking property we still check that m->_owner == Self |
1940 | // as java routines or native JNI code called by this thread might |
1941 | // have released the lock. |
1942 | // Refer to the comments in synchronizer.cpp for how we might encode extra |
1943 | // state in _succ so we can avoid fetching EntryList|cxq. |
1944 | // |
1945 | // I'd like to add more cases in fast_lock() and fast_unlock() -- |
1946 | // such as recursive enter and exit -- but we have to be wary of |
1947 | // I$ bloat, T$ effects and BP$ effects. |
1948 | // |
1949 | // If there's no contention try a 1-0 exit. That is, exit without |
1950 | // a costly MEMBAR or CAS. See synchronizer.cpp for details on how |
1951 | // we detect and recover from the race that the 1-0 exit admits. |
1952 | // |
1953 | // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier |
1954 | // before it STs null into _owner, releasing the lock. Updates |
1955 | // to data protected by the critical section must be visible before |
1956 | // we drop the lock (and thus before any other thread could acquire |
1957 | // the lock and observe the fields protected by the lock). |
  // IA32's memory-model is TSO, so STs are ordered with respect to
1959 | // each other and there's no need for an explicit barrier (fence). |
1960 | // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. |
1961 | #ifndef _LP64 |
1962 | get_thread (boxReg); |
1963 | |
1964 | // Note that we could employ various encoding schemes to reduce |
1965 | // the number of loads below (currently 4) to just 2 or 3. |
1966 | // Refer to the comments in synchronizer.cpp. |
1967 | // In practice the chain of fetches doesn't seem to impact performance, however. |
1968 | xorptr(boxReg, boxReg); |
1969 | orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); |
1970 | jccb (Assembler::notZero, DONE_LABEL); |
1971 | movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); |
1972 | orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); |
1973 | jccb (Assembler::notZero, CheckSucc); |
1974 | movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); |
1975 | jmpb (DONE_LABEL); |
1976 | |
1977 | bind (Stacked); |
1978 | // It's not inflated and it's not recursively stack-locked and it's not biased. |
1979 | // It must be stack-locked. |
1980 | // Try to reset the header to displaced header. |
1981 | // The "box" value on the stack is stable, so we can reload |
1982 | // and be assured we observe the same value as above. |
1983 | movptr(tmpReg, Address(boxReg, 0)); |
1984 | lock(); |
1985 | cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box |
  // Intentional fall-through into DONE_LABEL
1987 | |
1988 | // DONE_LABEL is a hot target - we'd really like to place it at the |
1989 | // start of cache line by padding with NOPs. |
1990 | // See the AMD and Intel software optimization manuals for the |
1991 | // most efficient "long" NOP encodings. |
1992 | // Unfortunately none of our alignment mechanisms suffice. |
1993 | bind (CheckSucc); |
1994 | #else // _LP64 |
1995 | // It's inflated |
1996 | xorptr(boxReg, boxReg); |
1997 | orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); |
1998 | jccb (Assembler::notZero, DONE_LABEL); |
1999 | movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); |
2000 | orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); |
2001 | jccb (Assembler::notZero, CheckSucc); |
2002 | movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); |
2003 | jmpb (DONE_LABEL); |
2004 | |
2005 | // Try to avoid passing control into the slow_path ... |
2006 | Label LSuccess, LGoSlowPath ; |
2007 | bind (CheckSucc); |
2008 | |
2009 | // The following optional optimization can be elided if necessary |
2010 | // Effectively: if (succ == null) goto SlowPath |
2011 | // The code reduces the window for a race, however, |
2012 | // and thus benefits performance. |
2013 | cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); |
2014 | jccb (Assembler::zero, LGoSlowPath); |
2015 | |
2016 | xorptr(boxReg, boxReg); |
2017 | movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); |
2018 | |
2019 | // Memory barrier/fence |
2020 | // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ |
2021 | // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. |
2022 | // This is faster on Nehalem and AMD Shanghai/Barcelona. |
2023 | // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences |
2024 | // We might also restructure (ST Owner=0;barrier;LD _Succ) to |
2025 | // (mov box,0; xchgq box, &m->Owner; LD _succ) . |
2026 | lock(); addl(Address(rsp, 0), 0); |
2027 | |
2028 | cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); |
2029 | jccb (Assembler::notZero, LSuccess); |
2030 | |
2031 | // Rare inopportune interleaving - race. |
2032 | // The successor vanished in the small window above. |
2033 | // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. |
2034 | // We need to ensure progress and succession. |
2035 | // Try to reacquire the lock. |
2036 | // If that fails then the new owner is responsible for succession and this |
2037 | // thread needs to take no further action and can exit via the fast path (success). |
2038 | // If the re-acquire succeeds then pass control into the slow path. |
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
2042 | |
2043 | // box is really RAX -- the following CMPXCHG depends on that binding |
2044 | // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) |
2045 | lock(); |
2046 | cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); |
2047 | // There's no successor so we tried to regrab the lock. |
2048 | // If that didn't work, then another thread grabbed the |
2049 | // lock so we're done (and exit was a success). |
2050 | jccb (Assembler::notEqual, LSuccess); |
2051 | // Intentional fall-through into slow-path |
2052 | |
2053 | bind (LGoSlowPath); |
2054 | orl (boxReg, 1); // set ICC.ZF=0 to indicate failure |
2055 | jmpb (DONE_LABEL); |
2056 | |
2057 | bind (LSuccess); |
2058 | testl (boxReg, 0); // set ICC.ZF=1 to indicate success |
2059 | jmpb (DONE_LABEL); |
2060 | |
2061 | bind (Stacked); |
2062 | movptr(tmpReg, Address (boxReg, 0)); // re-fetch |
2063 | lock(); |
2064 | cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box |
2065 | |
2066 | #endif |
2067 | bind(DONE_LABEL); |
2068 | } |
2069 | #endif // COMPILER2 |
2070 | |
2071 | void MacroAssembler::c2bool(Register x) { |
2072 | // implements x == 0 ? 0 : 1 |
2073 | // note: must only look at least-significant byte of x |
2074 | // since C-style booleans are stored in one byte |
2075 | // only! (was bug) |
2076 | andl(x, 0xFF); |
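  // The andl also sets ZF from the masked value; setb then writes 1 iff ZF == 0.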
2077 | setb(Assembler::notZero, x); |
2078 | } |
2079 | |
2080 | // Wouldn't need if AddressLiteral version had new name |
2081 | void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { |
2082 | Assembler::call(L, rtype); |
2083 | } |
2084 | |
2085 | void MacroAssembler::call(Register entry) { |
2086 | Assembler::call(entry); |
2087 | } |
2088 | |
2089 | void MacroAssembler::call(AddressLiteral entry) { |
2090 | if (reachable(entry)) { |
2091 | Assembler::call_literal(entry.target(), entry.rspec()); |
2092 | } else { |
2093 | lea(rscratch1, entry); |
2094 | Assembler::call(rscratch1); |
2095 | } |
2096 | } |
2097 | |
2098 | void MacroAssembler::ic_call(address entry, jint method_index) { |
2099 | RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); |
2100 | movptr(rax, (intptr_t)Universe::non_oop_word()); |
2101 | call(AddressLiteral(entry, rh)); |
2102 | } |
2103 | |
2104 | // Implementation of call_VM versions |
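// The variants below use a small local trampoline: 'call C' pushes a return
// address (from which call_VM_helper derives last_Java_pc) and transfers to C,
// where the helper does the real work; its 'ret' resumes at the 'jmp E', which
// skips over the trampoline body.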
2105 | |
2106 | void MacroAssembler::call_VM(Register oop_result, |
2107 | address entry_point, |
2108 | bool check_exceptions) { |
2109 | Label C, E; |
2110 | call(C, relocInfo::none); |
2111 | jmp(E); |
2112 | |
2113 | bind(C); |
2114 | call_VM_helper(oop_result, entry_point, 0, check_exceptions); |
2115 | ret(0); |
2116 | |
2117 | bind(E); |
2118 | } |
2119 | |
2120 | void MacroAssembler::call_VM(Register oop_result, |
2121 | address entry_point, |
2122 | Register arg_1, |
2123 | bool check_exceptions) { |
2124 | Label C, E; |
2125 | call(C, relocInfo::none); |
2126 | jmp(E); |
2127 | |
2128 | bind(C); |
2129 | pass_arg1(this, arg_1); |
2130 | call_VM_helper(oop_result, entry_point, 1, check_exceptions); |
2131 | ret(0); |
2132 | |
2133 | bind(E); |
2134 | } |
2135 | |
2136 | void MacroAssembler::call_VM(Register oop_result, |
2137 | address entry_point, |
2138 | Register arg_1, |
2139 | Register arg_2, |
2140 | bool check_exceptions) { |
2141 | Label C, E; |
2142 | call(C, relocInfo::none); |
2143 | jmp(E); |
2144 | |
2145 | bind(C); |
2146 | |
2147 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2148 | |
2149 | pass_arg2(this, arg_2); |
2150 | pass_arg1(this, arg_1); |
2151 | call_VM_helper(oop_result, entry_point, 2, check_exceptions); |
2152 | ret(0); |
2153 | |
2154 | bind(E); |
2155 | } |
2156 | |
2157 | void MacroAssembler::call_VM(Register oop_result, |
2158 | address entry_point, |
2159 | Register arg_1, |
2160 | Register arg_2, |
2161 | Register arg_3, |
2162 | bool check_exceptions) { |
2163 | Label C, E; |
2164 | call(C, relocInfo::none); |
2165 | jmp(E); |
2166 | |
2167 | bind(C); |
2168 | |
2169 | LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg" )); |
2170 | LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg" )); |
2171 | pass_arg3(this, arg_3); |
2172 | |
2173 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2174 | pass_arg2(this, arg_2); |
2175 | |
2176 | pass_arg1(this, arg_1); |
2177 | call_VM_helper(oop_result, entry_point, 3, check_exceptions); |
2178 | ret(0); |
2179 | |
2180 | bind(E); |
2181 | } |
2182 | |
2183 | void MacroAssembler::call_VM(Register oop_result, |
2184 | Register last_java_sp, |
2185 | address entry_point, |
2186 | int number_of_arguments, |
2187 | bool check_exceptions) { |
2188 | Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); |
2189 | call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); |
2190 | } |
2191 | |
2192 | void MacroAssembler::call_VM(Register oop_result, |
2193 | Register last_java_sp, |
2194 | address entry_point, |
2195 | Register arg_1, |
2196 | bool check_exceptions) { |
2197 | pass_arg1(this, arg_1); |
2198 | call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); |
2199 | } |
2200 | |
2201 | void MacroAssembler::call_VM(Register oop_result, |
2202 | Register last_java_sp, |
2203 | address entry_point, |
2204 | Register arg_1, |
2205 | Register arg_2, |
2206 | bool check_exceptions) { |
2207 | |
2208 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2209 | pass_arg2(this, arg_2); |
2210 | pass_arg1(this, arg_1); |
2211 | call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); |
2212 | } |
2213 | |
2214 | void MacroAssembler::call_VM(Register oop_result, |
2215 | Register last_java_sp, |
2216 | address entry_point, |
2217 | Register arg_1, |
2218 | Register arg_2, |
2219 | Register arg_3, |
2220 | bool check_exceptions) { |
2221 | LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg" )); |
2222 | LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg" )); |
2223 | pass_arg3(this, arg_3); |
2224 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2225 | pass_arg2(this, arg_2); |
2226 | pass_arg1(this, arg_1); |
2227 | call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); |
2228 | } |
2229 | |
2230 | void MacroAssembler::super_call_VM(Register oop_result, |
2231 | Register last_java_sp, |
2232 | address entry_point, |
2233 | int number_of_arguments, |
2234 | bool check_exceptions) { |
2235 | Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); |
2236 | MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); |
2237 | } |
2238 | |
2239 | void MacroAssembler::super_call_VM(Register oop_result, |
2240 | Register last_java_sp, |
2241 | address entry_point, |
2242 | Register arg_1, |
2243 | bool check_exceptions) { |
2244 | pass_arg1(this, arg_1); |
2245 | super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); |
2246 | } |
2247 | |
2248 | void MacroAssembler::super_call_VM(Register oop_result, |
2249 | Register last_java_sp, |
2250 | address entry_point, |
2251 | Register arg_1, |
2252 | Register arg_2, |
2253 | bool check_exceptions) { |
2254 | |
2255 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2256 | pass_arg2(this, arg_2); |
2257 | pass_arg1(this, arg_1); |
2258 | super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); |
2259 | } |
2260 | |
2261 | void MacroAssembler::super_call_VM(Register oop_result, |
2262 | Register last_java_sp, |
2263 | address entry_point, |
2264 | Register arg_1, |
2265 | Register arg_2, |
2266 | Register arg_3, |
2267 | bool check_exceptions) { |
2268 | LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg" )); |
2269 | LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg" )); |
2270 | pass_arg3(this, arg_3); |
2271 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2272 | pass_arg2(this, arg_2); |
2273 | pass_arg1(this, arg_1); |
2274 | super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); |
2275 | } |
2276 | |
2277 | void MacroAssembler::call_VM_base(Register oop_result, |
2278 | Register java_thread, |
2279 | Register last_java_sp, |
2280 | address entry_point, |
2281 | int number_of_arguments, |
2282 | bool check_exceptions) { |
2283 | // determine java_thread register |
2284 | if (!java_thread->is_valid()) { |
2285 | #ifdef _LP64 |
2286 | java_thread = r15_thread; |
2287 | #else |
2288 | java_thread = rdi; |
2289 | get_thread(java_thread); |
2290 | #endif // LP64 |
2291 | } |
2292 | // determine last_java_sp register |
2293 | if (!last_java_sp->is_valid()) { |
2294 | last_java_sp = rsp; |
2295 | } |
2296 | // debugging support |
2297 | assert(number_of_arguments >= 0 , "cannot have negative number of arguments" ); |
2298 | LP64_ONLY(assert(java_thread == r15_thread, "unexpected register" )); |
2299 | #ifdef ASSERT |
2300 | // TraceBytecodes does not use r12 but saves it over the call, so don't verify |
2301 | // r12 is the heapbase. |
2302 | LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?" );) |
2303 | #endif // ASSERT |
2304 | |
2305 | assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result" ); |
2306 | assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp" ); |
2307 | |
2308 | // push java thread (becomes first argument of C function) |
2309 | |
2310 | NOT_LP64(push(java_thread); number_of_arguments++); |
2311 | LP64_ONLY(mov(c_rarg0, r15_thread)); |
2312 | |
2313 | // set last Java frame before call |
2314 | assert(last_java_sp != rbp, "can't use ebp/rbp" ); |
2315 | |
2316 | // Only interpreter should have to set fp |
2317 | set_last_Java_frame(java_thread, last_java_sp, rbp, NULL); |
2318 | |
2319 | // do the call, remove parameters |
2320 | MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments); |
2321 | |
2322 | // restore the thread (cannot use the pushed argument since arguments |
2323 | // may be overwritten by C code generated by an optimizing compiler); |
  // however, we can use the register value directly if it is callee saved.
2325 | if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) { |
2326 | // rdi & rsi (also r15) are callee saved -> nothing to do |
2327 | #ifdef ASSERT |
2328 | guarantee(java_thread != rax, "change this code" ); |
2329 | push(rax); |
2330 | { Label L; |
2331 | get_thread(rax); |
2332 | cmpptr(java_thread, rax); |
2333 | jcc(Assembler::equal, L); |
2334 | STOP("MacroAssembler::call_VM_base: rdi not callee saved?" ); |
2335 | bind(L); |
2336 | } |
2337 | pop(rax); |
2338 | #endif |
2339 | } else { |
2340 | get_thread(java_thread); |
2341 | } |
2342 | // reset last Java frame |
2343 | // Only interpreter should have to clear fp |
2344 | reset_last_Java_frame(java_thread, true); |
2345 | |
2346 | // C++ interp handles this in the interpreter |
2347 | check_and_handle_popframe(java_thread); |
2348 | check_and_handle_earlyret(java_thread); |
2349 | |
2350 | if (check_exceptions) { |
2351 | // check for pending exceptions (java_thread is set upon return) |
2352 | cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD); |
2353 | #ifndef _LP64 |
2354 | jump_cc(Assembler::notEqual, |
2355 | RuntimeAddress(StubRoutines::forward_exception_entry())); |
2356 | #else |
    // This used to jump conditionally to forward_exception; however, if the
    // code is relocated that branch might not reach, so instead we branch
    // around an unconditional jump that can always reach the target.
2360 | |
2361 | Label ok; |
2362 | jcc(Assembler::equal, ok); |
2363 | jump(RuntimeAddress(StubRoutines::forward_exception_entry())); |
2364 | bind(ok); |
2365 | #endif // LP64 |
2366 | } |
2367 | |
2368 | // get oop result if there is one and reset the value in the thread |
2369 | if (oop_result->is_valid()) { |
2370 | get_vm_result(oop_result, java_thread); |
2371 | } |
2372 | } |
2373 | |
2374 | void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { |
2375 | |
  // Calculating the value for last_Java_sp is somewhat subtle.
  // call_VM does an intermediate call which places a return address
  // on the stack just under the stack pointer as the user finished
  // with it. This allows us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32-bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64-bit, call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
2385 | |
2386 | #ifdef _LP64 |
2387 | // We've pushed one address, correct last_Java_sp |
2388 | lea(rax, Address(rsp, wordSize)); |
2389 | #else |
2390 | lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize)); |
2391 | #endif // LP64 |
2392 | |
2393 | call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions); |
2394 | |
2395 | } |
2396 | |
2397 | // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter. |
2398 | void MacroAssembler::call_VM_leaf0(address entry_point) { |
2399 | MacroAssembler::call_VM_leaf_base(entry_point, 0); |
2400 | } |
2401 | |
2402 | void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { |
2403 | call_VM_leaf_base(entry_point, number_of_arguments); |
2404 | } |
2405 | |
2406 | void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { |
2407 | pass_arg0(this, arg_0); |
2408 | call_VM_leaf(entry_point, 1); |
2409 | } |
2410 | |
2411 | void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { |
2412 | |
2413 | LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg" )); |
2414 | pass_arg1(this, arg_1); |
2415 | pass_arg0(this, arg_0); |
2416 | call_VM_leaf(entry_point, 2); |
2417 | } |
2418 | |
2419 | void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { |
2420 | LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg" )); |
2421 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2422 | pass_arg2(this, arg_2); |
2423 | LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg" )); |
2424 | pass_arg1(this, arg_1); |
2425 | pass_arg0(this, arg_0); |
2426 | call_VM_leaf(entry_point, 3); |
2427 | } |
2428 | |
2429 | void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { |
2430 | pass_arg0(this, arg_0); |
2431 | MacroAssembler::call_VM_leaf_base(entry_point, 1); |
2432 | } |
2433 | |
2434 | void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { |
2435 | |
2436 | LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg" )); |
2437 | pass_arg1(this, arg_1); |
2438 | pass_arg0(this, arg_0); |
2439 | MacroAssembler::call_VM_leaf_base(entry_point, 2); |
2440 | } |
2441 | |
2442 | void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { |
2443 | LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg" )); |
2444 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2445 | pass_arg2(this, arg_2); |
2446 | LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg" )); |
2447 | pass_arg1(this, arg_1); |
2448 | pass_arg0(this, arg_0); |
2449 | MacroAssembler::call_VM_leaf_base(entry_point, 3); |
2450 | } |
2451 | |
2452 | void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { |
2453 | LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg" )); |
2454 | LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg" )); |
2455 | LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg" )); |
2456 | pass_arg3(this, arg_3); |
2457 | LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg" )); |
2458 | LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg" )); |
2459 | pass_arg2(this, arg_2); |
2460 | LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg" )); |
2461 | pass_arg1(this, arg_1); |
2462 | pass_arg0(this, arg_0); |
2463 | MacroAssembler::call_VM_leaf_base(entry_point, 4); |
2464 | } |
2465 | |
2466 | void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { |
2467 | movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); |
2468 | movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD); |
2469 | verify_oop(oop_result, "broken oop in call_VM_base" ); |
2470 | } |
2471 | |
2472 | void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { |
2473 | movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); |
2474 | movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); |
2475 | } |
2476 | |
2477 | void MacroAssembler::check_and_handle_earlyret(Register java_thread) { |
2478 | } |
2479 | |
2480 | void MacroAssembler::check_and_handle_popframe(Register java_thread) { |
2481 | } |
2482 | |
2483 | void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) { |
2484 | if (reachable(src1)) { |
2485 | cmpl(as_Address(src1), imm); |
2486 | } else { |
2487 | lea(rscratch1, src1); |
2488 | cmpl(Address(rscratch1, 0), imm); |
2489 | } |
2490 | } |
2491 | |
2492 | void MacroAssembler::cmp32(Register src1, AddressLiteral src2) { |
2493 | assert(!src2.is_lval(), "use cmpptr" ); |
2494 | if (reachable(src2)) { |
2495 | cmpl(src1, as_Address(src2)); |
2496 | } else { |
2497 | lea(rscratch1, src2); |
2498 | cmpl(src1, Address(rscratch1, 0)); |
2499 | } |
2500 | } |
2501 | |
2502 | void MacroAssembler::cmp32(Register src1, int32_t imm) { |
2503 | Assembler::cmpl(src1, imm); |
2504 | } |
2505 | |
2506 | void MacroAssembler::cmp32(Register src1, Address src2) { |
2507 | Assembler::cmpl(src1, src2); |
2508 | } |
2509 | |
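// The two helpers below map an unordered SSE compare onto a -1/0/+1 integer
// result, with a NaN operand folded to -1 or +1 according to unordered_is_less
// (the fcmpl/fcmpg style used for Java's floating-point compares).
// ucomisd/ucomiss set ZF, PF and CF; an unordered result sets all three, which
// the parity checks detect.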
2510 | void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { |
2511 | ucomisd(opr1, opr2); |
2512 | |
2513 | Label L; |
2514 | if (unordered_is_less) { |
2515 | movl(dst, -1); |
2516 | jcc(Assembler::parity, L); |
2517 | jcc(Assembler::below , L); |
2518 | movl(dst, 0); |
2519 | jcc(Assembler::equal , L); |
2520 | increment(dst); |
2521 | } else { // unordered is greater |
2522 | movl(dst, 1); |
2523 | jcc(Assembler::parity, L); |
2524 | jcc(Assembler::above , L); |
2525 | movl(dst, 0); |
2526 | jcc(Assembler::equal , L); |
2527 | decrementl(dst); |
2528 | } |
2529 | bind(L); |
2530 | } |
2531 | |
2532 | void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { |
2533 | ucomiss(opr1, opr2); |
2534 | |
2535 | Label L; |
2536 | if (unordered_is_less) { |
2537 | movl(dst, -1); |
2538 | jcc(Assembler::parity, L); |
2539 | jcc(Assembler::below , L); |
2540 | movl(dst, 0); |
2541 | jcc(Assembler::equal , L); |
2542 | increment(dst); |
2543 | } else { // unordered is greater |
2544 | movl(dst, 1); |
2545 | jcc(Assembler::parity, L); |
2546 | jcc(Assembler::above , L); |
2547 | movl(dst, 0); |
2548 | jcc(Assembler::equal , L); |
2549 | decrementl(dst); |
2550 | } |
2551 | bind(L); |
2552 | } |
2553 | |
2554 | |
2555 | void MacroAssembler::cmp8(AddressLiteral src1, int imm) { |
2556 | if (reachable(src1)) { |
2557 | cmpb(as_Address(src1), imm); |
2558 | } else { |
2559 | lea(rscratch1, src1); |
2560 | cmpb(Address(rscratch1, 0), imm); |
2561 | } |
2562 | } |
2563 | |
2564 | void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) { |
2565 | #ifdef _LP64 |
2566 | if (src2.is_lval()) { |
2567 | movptr(rscratch1, src2); |
2568 | Assembler::cmpq(src1, rscratch1); |
2569 | } else if (reachable(src2)) { |
2570 | cmpq(src1, as_Address(src2)); |
2571 | } else { |
2572 | lea(rscratch1, src2); |
2573 | Assembler::cmpq(src1, Address(rscratch1, 0)); |
2574 | } |
2575 | #else |
2576 | if (src2.is_lval()) { |
2577 | cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); |
2578 | } else { |
2579 | cmpl(src1, as_Address(src2)); |
2580 | } |
2581 | #endif // _LP64 |
2582 | } |
2583 | |
2584 | void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) { |
2585 | assert(src2.is_lval(), "not a mem-mem compare" ); |
2586 | #ifdef _LP64 |
2587 | // moves src2's literal address |
2588 | movptr(rscratch1, src2); |
2589 | Assembler::cmpq(src1, rscratch1); |
2590 | #else |
2591 | cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); |
2592 | #endif // _LP64 |
2593 | } |
2594 | |
2595 | void MacroAssembler::cmpoop(Register src1, Register src2) { |
2596 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2597 | bs->obj_equals(this, src1, src2); |
2598 | } |
2599 | |
2600 | void MacroAssembler::cmpoop(Register src1, Address src2) { |
2601 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2602 | bs->obj_equals(this, src1, src2); |
2603 | } |
2604 | |
2605 | #ifdef _LP64 |
2606 | void MacroAssembler::cmpoop(Register src1, jobject src2) { |
2607 | movoop(rscratch1, src2); |
2608 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
2609 | bs->obj_equals(this, src1, rscratch1); |
2610 | } |
2611 | #endif |
2612 | |
2613 | void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) { |
2614 | if (reachable(adr)) { |
2615 | lock(); |
2616 | cmpxchgptr(reg, as_Address(adr)); |
2617 | } else { |
2618 | lea(rscratch1, adr); |
2619 | lock(); |
2620 | cmpxchgptr(reg, Address(rscratch1, 0)); |
2621 | } |
2622 | } |
2623 | |
2624 | void MacroAssembler::cmpxchgptr(Register reg, Address adr) { |
2625 | LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); |
2626 | } |
2627 | |
2628 | void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) { |
2629 | if (reachable(src)) { |
2630 | Assembler::comisd(dst, as_Address(src)); |
2631 | } else { |
2632 | lea(rscratch1, src); |
2633 | Assembler::comisd(dst, Address(rscratch1, 0)); |
2634 | } |
2635 | } |
2636 | |
2637 | void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) { |
2638 | if (reachable(src)) { |
2639 | Assembler::comiss(dst, as_Address(src)); |
2640 | } else { |
2641 | lea(rscratch1, src); |
2642 | Assembler::comiss(dst, Address(rscratch1, 0)); |
2643 | } |
2644 | } |
2645 | |
2646 | |
2647 | void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) { |
2648 | Condition negated_cond = negate_condition(cond); |
2649 | Label L; |
2650 | jcc(negated_cond, L); |
2651 | pushf(); // Preserve flags |
2652 | atomic_incl(counter_addr); |
2653 | popf(); |
2654 | bind(L); |
2655 | } |
2656 | |
2657 | int MacroAssembler::corrected_idivl(Register reg) { |
2658 | // Full implementation of Java idiv and irem; checks for |
2659 | // special case as described in JVM spec., p.243 & p.271. |
2660 | // The function returns the (pc) offset of the idivl |
2661 | // instruction - may be needed for implicit exceptions. |
2662 | // |
2663 | // normal case special case |
2664 | // |
2665 | // input : rax,: dividend min_int |
2666 | // reg: divisor (may not be rax,/rdx) -1 |
2667 | // |
2668 | // output: rax,: quotient (= rax, idiv reg) min_int |
2669 | // rdx: remainder (= rax, irem reg) 0 |
2670 | assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register" ); |
2671 | const int min_int = 0x80000000; |
2672 | Label normal_case, special_case; |
2673 | |
2674 | // check for special case |
2675 | cmpl(rax, min_int); |
2676 | jcc(Assembler::notEqual, normal_case); |
2677 | xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) |
2678 | cmpl(reg, -1); |
2679 | jcc(Assembler::equal, special_case); |
2680 | |
2681 | // handle normal case |
2682 | bind(normal_case); |
2683 | cdql(); |
2684 | int idivl_offset = offset(); |
2685 | idivl(reg); |
2686 | |
2687 | // normal and special case exit |
2688 | bind(special_case); |
2689 | |
2690 | return idivl_offset; |
2691 | } |
2692 | |
2693 | |
2694 | |
2695 | void MacroAssembler::decrementl(Register reg, int value) { |
2696 | if (value == min_jint) {subl(reg, value) ; return; } |
2697 | if (value < 0) { incrementl(reg, -value); return; } |
2698 | if (value == 0) { ; return; } |
2699 | if (value == 1 && UseIncDec) { decl(reg) ; return; } |
2700 | /* else */ { subl(reg, value) ; return; } |
2701 | } |
2702 | |
2703 | void MacroAssembler::decrementl(Address dst, int value) { |
2704 | if (value == min_jint) {subl(dst, value) ; return; } |
2705 | if (value < 0) { incrementl(dst, -value); return; } |
2706 | if (value == 0) { ; return; } |
2707 | if (value == 1 && UseIncDec) { decl(dst) ; return; } |
2708 | /* else */ { subl(dst, value) ; return; } |
2709 | } |
2710 | |
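// Signed division by a power of two: a plain arithmetic shift rounds toward
// negative infinity, so for a negative dividend we first add (2^shift - 1) so
// that the shift truncates toward zero as Java division requires.
// Example (shift_value == 2): -7 + 3 = -4, and -4 >> 2 = -1, which equals -7 / 4.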
2711 | void MacroAssembler::division_with_shift (Register reg, int shift_value) { |
2712 | assert (shift_value > 0, "illegal shift value" ); |
2713 | Label _is_positive; |
2714 | testl (reg, reg); |
2715 | jcc (Assembler::positive, _is_positive); |
2716 | int offset = (1 << shift_value) - 1 ; |
2717 | |
2718 | if (offset == 1) { |
2719 | incrementl(reg); |
2720 | } else { |
2721 | addl(reg, offset); |
2722 | } |
2723 | |
2724 | bind (_is_positive); |
2725 | sarl(reg, shift_value); |
2726 | } |
2727 | |
2728 | void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) { |
2729 | if (reachable(src)) { |
2730 | Assembler::divsd(dst, as_Address(src)); |
2731 | } else { |
2732 | lea(rscratch1, src); |
2733 | Assembler::divsd(dst, Address(rscratch1, 0)); |
2734 | } |
2735 | } |
2736 | |
2737 | void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) { |
2738 | if (reachable(src)) { |
2739 | Assembler::divss(dst, as_Address(src)); |
2740 | } else { |
2741 | lea(rscratch1, src); |
2742 | Assembler::divss(dst, Address(rscratch1, 0)); |
2743 | } |
2744 | } |
2745 | |
2746 | // !defined(COMPILER2) is because of stupid core builds |
2747 | #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI |
2748 | void MacroAssembler::empty_FPU_stack() { |
2749 | if (VM_Version::supports_mmx()) { |
2750 | emms(); |
2751 | } else { |
2752 | for (int i = 8; i-- > 0; ) ffree(i); |
2753 | } |
2754 | } |
2755 | #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI |
2756 | |
2757 | |
2758 | void MacroAssembler::enter() { |
2759 | push(rbp); |
2760 | mov(rbp, rsp); |
2761 | } |
2762 | |
2763 | // A 5 byte nop that is safe for patching (see patch_verified_entry) |
2764 | void MacroAssembler::fat_nop() { |
2765 | if (UseAddressNop) { |
2766 | addr_nop_5(); |
2767 | } else { |
2768 | emit_int8(0x26); // es: |
2769 | emit_int8(0x2e); // cs: |
2770 | emit_int8(0x64); // fs: |
2771 | emit_int8(0x65); // gs: |
2772 | emit_int8((unsigned char)0x90); |
2773 | } |
2774 | } |
2775 | |
2776 | void MacroAssembler::fcmp(Register tmp) { |
2777 | fcmp(tmp, 1, true, true); |
2778 | } |
2779 | |
2780 | void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { |
2781 | assert(!pop_right || pop_left, "usage error" ); |
2782 | if (VM_Version::supports_cmov()) { |
2783 | assert(tmp == noreg, "unneeded temp" ); |
2784 | if (pop_left) { |
2785 | fucomip(index); |
2786 | } else { |
2787 | fucomi(index); |
2788 | } |
2789 | if (pop_right) { |
2790 | fpop(); |
2791 | } |
2792 | } else { |
2793 | assert(tmp != noreg, "need temp" ); |
2794 | if (pop_left) { |
2795 | if (pop_right) { |
2796 | fcompp(); |
2797 | } else { |
2798 | fcomp(index); |
2799 | } |
2800 | } else { |
2801 | fcom(index); |
2802 | } |
2803 | // convert FPU condition into eflags condition via rax, |
2804 | save_rax(tmp); |
2805 | fwait(); fnstsw_ax(); |
2806 | sahf(); |
2807 | restore_rax(tmp); |
2808 | } |
2809 | // condition codes set as follows: |
2810 | // |
2811 | // CF (corresponds to C0) if x < y |
2812 | // PF (corresponds to C2) if unordered |
2813 | // ZF (corresponds to C3) if x = y |
2814 | } |
2815 | |
2816 | void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { |
2817 | fcmp2int(dst, unordered_is_less, 1, true, true); |
2818 | } |
2819 | |
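// Convert the result of an FPU compare into an integer in dst:
// -1 if the first operand is less, 0 if equal, +1 if greater. Unordered
// results (NaN operands) map to -1 or +1 depending on unordered_is_less.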
2820 | void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { |
2821 | fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); |
2822 | Label L; |
2823 | if (unordered_is_less) { |
2824 | movl(dst, -1); |
2825 | jcc(Assembler::parity, L); |
2826 | jcc(Assembler::below , L); |
2827 | movl(dst, 0); |
2828 | jcc(Assembler::equal , L); |
2829 | increment(dst); |
2830 | } else { // unordered is greater |
2831 | movl(dst, 1); |
2832 | jcc(Assembler::parity, L); |
2833 | jcc(Assembler::above , L); |
2834 | movl(dst, 0); |
2835 | jcc(Assembler::equal , L); |
2836 | decrementl(dst); |
2837 | } |
2838 | bind(L); |
2839 | } |
2840 | |
2841 | void MacroAssembler::fld_d(AddressLiteral src) { |
2842 | fld_d(as_Address(src)); |
2843 | } |
2844 | |
2845 | void MacroAssembler::fld_s(AddressLiteral src) { |
2846 | fld_s(as_Address(src)); |
2847 | } |
2848 | |
2849 | void MacroAssembler::fld_x(AddressLiteral src) { |
2850 | Assembler::fld_x(as_Address(src)); |
2851 | } |
2852 | |
2853 | void MacroAssembler::fldcw(AddressLiteral src) { |
2854 | Assembler::fldcw(as_Address(src)); |
2855 | } |
2856 | |
2857 | void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) { |
2858 | if (reachable(src)) { |
2859 | Assembler::mulpd(dst, as_Address(src)); |
2860 | } else { |
2861 | lea(rscratch1, src); |
2862 | Assembler::mulpd(dst, Address(rscratch1, 0)); |
2863 | } |
2864 | } |
2865 | |
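// Temporarily raise the x87 precision control field (bits 8-9 of the FPU
// control word) to 11b (the 0x300 mask), i.e. 64-bit extended precision.
// The original control word is left on the stack and is reinstated by
// restore_precision().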
2866 | void MacroAssembler::increase_precision() { |
2867 | subptr(rsp, BytesPerWord); |
2868 | fnstcw(Address(rsp, 0)); |
2869 | movl(rax, Address(rsp, 0)); |
2870 | orl(rax, 0x300); |
2871 | push(rax); |
2872 | fldcw(Address(rsp, 0)); |
2873 | pop(rax); |
2874 | } |
2875 | |
2876 | void MacroAssembler::restore_precision() { |
2877 | fldcw(Address(rsp, 0)); |
2878 | addptr(rsp, BytesPerWord); |
2879 | } |
2880 | |
2881 | void MacroAssembler::fpop() { |
2882 | ffree(); |
2883 | fincstp(); |
2884 | } |
2885 | |
2886 | void MacroAssembler::load_float(Address src) { |
2887 | if (UseSSE >= 1) { |
2888 | movflt(xmm0, src); |
2889 | } else { |
2890 | LP64_ONLY(ShouldNotReachHere()); |
2891 | NOT_LP64(fld_s(src)); |
2892 | } |
2893 | } |
2894 | |
2895 | void MacroAssembler::store_float(Address dst) { |
2896 | if (UseSSE >= 1) { |
2897 | movflt(dst, xmm0); |
2898 | } else { |
2899 | LP64_ONLY(ShouldNotReachHere()); |
2900 | NOT_LP64(fstp_s(dst)); |
2901 | } |
2902 | } |
2903 | |
2904 | void MacroAssembler::load_double(Address src) { |
2905 | if (UseSSE >= 2) { |
2906 | movdbl(xmm0, src); |
2907 | } else { |
2908 | LP64_ONLY(ShouldNotReachHere()); |
2909 | NOT_LP64(fld_d(src)); |
2910 | } |
2911 | } |
2912 | |
2913 | void MacroAssembler::store_double(Address dst) { |
2914 | if (UseSSE >= 2) { |
2915 | movdbl(dst, xmm0); |
2916 | } else { |
2917 | LP64_ONLY(ShouldNotReachHere()); |
2918 | NOT_LP64(fstp_d(dst)); |
2919 | } |
2920 | } |
2921 | |
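// Partial remainder: fprem must be re-executed until the FPU signals
// completion. The C2 flag (bit 0x400 of the FPU status word) remains set
// while the reduction is incomplete, hence the loop below.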
2922 | void MacroAssembler::fremr(Register tmp) { |
2923 | save_rax(tmp); |
2924 | { Label L; |
2925 | bind(L); |
2926 | fprem(); |
2927 | fwait(); fnstsw_ax(); |
2928 | #ifdef _LP64 |
2929 | testl(rax, 0x400); |
2930 | jcc(Assembler::notEqual, L); |
2931 | #else |
2932 | sahf(); |
2933 | jcc(Assembler::parity, L); |
2934 | #endif // _LP64 |
2935 | } |
2936 | restore_rax(tmp); |
2937 | // Result is in ST0. |
2938 | // Note: fxch & fpop to get rid of ST1 |
2939 | // (otherwise FPU stack could overflow eventually) |
2940 | fxch(1); |
2941 | fpop(); |
2942 | } |
2943 | |
2944 | // dst = c = a * b + c |
2945 | void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { |
2946 | Assembler::vfmadd231sd(c, a, b); |
2947 | if (dst != c) { |
2948 | movdbl(dst, c); |
2949 | } |
2950 | } |
2951 | |
2952 | // dst = c = a * b + c |
2953 | void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { |
2954 | Assembler::vfmadd231ss(c, a, b); |
2955 | if (dst != c) { |
2956 | movflt(dst, c); |
2957 | } |
2958 | } |
2959 | |
2960 | // dst = c = a * b + c |
2961 | void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { |
2962 | Assembler::vfmadd231pd(c, a, b, vector_len); |
2963 | if (dst != c) { |
2964 | vmovdqu(dst, c); |
2965 | } |
2966 | } |
2967 | |
2968 | // dst = c = a * b + c |
2969 | void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { |
2970 | Assembler::vfmadd231ps(c, a, b, vector_len); |
2971 | if (dst != c) { |
2972 | vmovdqu(dst, c); |
2973 | } |
2974 | } |
2975 | |
2976 | // dst = c = a * b + c |
2977 | void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { |
2978 | Assembler::vfmadd231pd(c, a, b, vector_len); |
2979 | if (dst != c) { |
2980 | vmovdqu(dst, c); |
2981 | } |
2982 | } |
2983 | |
2984 | // dst = c = a * b + c |
2985 | void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { |
2986 | Assembler::vfmadd231ps(c, a, b, vector_len); |
2987 | if (dst != c) { |
2988 | vmovdqu(dst, c); |
2989 | } |
2990 | } |
2991 | |
2992 | void MacroAssembler::incrementl(AddressLiteral dst) { |
2993 | if (reachable(dst)) { |
2994 | incrementl(as_Address(dst)); |
2995 | } else { |
2996 | lea(rscratch1, dst); |
2997 | incrementl(Address(rscratch1, 0)); |
2998 | } |
2999 | } |
3000 | |
3001 | void MacroAssembler::incrementl(ArrayAddress dst) { |
3002 | incrementl(as_Address(dst)); |
3003 | } |
3004 | |
3005 | void MacroAssembler::incrementl(Register reg, int value) { |
3006 | if (value == min_jint) {addl(reg, value) ; return; } |
3007 | if (value < 0) { decrementl(reg, -value); return; } |
3008 | if (value == 0) { ; return; } |
3009 | if (value == 1 && UseIncDec) { incl(reg) ; return; } |
3010 | /* else */ { addl(reg, value) ; return; } |
3011 | } |
3012 | |
3013 | void MacroAssembler::incrementl(Address dst, int value) { |
3014 | if (value == min_jint) {addl(dst, value) ; return; } |
3015 | if (value < 0) { decrementl(dst, -value); return; } |
3016 | if (value == 0) { ; return; } |
3017 | if (value == 1 && UseIncDec) { incl(dst) ; return; } |
3018 | /* else */ { addl(dst, value) ; return; } |
3019 | } |
3020 | |
3021 | void MacroAssembler::jump(AddressLiteral dst) { |
3022 | if (reachable(dst)) { |
3023 | jmp_literal(dst.target(), dst.rspec()); |
3024 | } else { |
3025 | lea(rscratch1, dst); |
3026 | jmp(rscratch1); |
3027 | } |
3028 | } |
3029 | |
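// Conditional jump to an AddressLiteral. A reachable target gets a short
// (2-byte) or long (6-byte) jcc encoding directly; an unreachable target is
// handled by branching around an indirect jump through rscratch1 using the
// inverted condition from the reverse[] table.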
3030 | void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) { |
3031 | if (reachable(dst)) { |
3032 | InstructionMark im(this); |
3033 | relocate(dst.reloc()); |
3034 | const int short_size = 2; |
3035 | const int long_size = 6; |
3036 | int offs = (intptr_t)dst.target() - ((intptr_t)pc()); |
3037 | if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { |
3038 | // 0111 tttn #8-bit disp |
3039 | emit_int8(0x70 | cc); |
3040 | emit_int8((offs - short_size) & 0xFF); |
3041 | } else { |
3042 | // 0000 1111 1000 tttn #32-bit disp |
3043 | emit_int8(0x0F); |
3044 | emit_int8((unsigned char)(0x80 | cc)); |
3045 | emit_int32(offs - long_size); |
3046 | } |
3047 | } else { |
3048 | #ifdef ASSERT |
3049 | warning("reversing conditional branch" ); |
3050 | #endif /* ASSERT */ |
3051 | Label skip; |
3052 | jccb(reverse[cc], skip); |
3053 | lea(rscratch1, dst); |
3054 | Assembler::jmp(rscratch1); |
3055 | bind(skip); |
3056 | } |
3057 | } |
3058 | |
3059 | void MacroAssembler::ldmxcsr(AddressLiteral src) { |
3060 | if (reachable(src)) { |
3061 | Assembler::ldmxcsr(as_Address(src)); |
3062 | } else { |
3063 | lea(rscratch1, src); |
3064 | Assembler::ldmxcsr(Address(rscratch1, 0)); |
3065 | } |
3066 | } |
3067 | |
3068 | int MacroAssembler::load_signed_byte(Register dst, Address src) { |
3069 | int off; |
3070 | if (LP64_ONLY(true ||) VM_Version::is_P6()) { |
3071 | off = offset(); |
3072 | movsbl(dst, src); // movsxb |
3073 | } else { |
3074 | off = load_unsigned_byte(dst, src); |
3075 | shll(dst, 24); |
3076 | sarl(dst, 24); |
3077 | } |
3078 | return off; |
3079 | } |
3080 | |
3081 | // Note: load_signed_short used to be called load_signed_word. |
3082 | // Although the 'w' in x86 opcodes refers to the term "word" in the assembler |
3083 | // manual, which means 16 bits, that usage is found nowhere in HotSpot code. |
3084 | // The term "word" in HotSpot means a 32- or 64-bit machine word. |
3085 | int MacroAssembler::load_signed_short(Register dst, Address src) { |
3086 | int off; |
3087 | if (LP64_ONLY(true ||) VM_Version::is_P6()) { |
    // This is dubious to me: a signed 16 => 64 bit version seems just as safe,
    // but this is what the 64-bit code has always done, which suggests that
    // callers only rely on the low 32 bits.
3091 | off = offset(); |
3092 | movswl(dst, src); // movsxw |
3093 | } else { |
3094 | off = load_unsigned_short(dst, src); |
3095 | shll(dst, 16); |
3096 | sarl(dst, 16); |
3097 | } |
3098 | return off; |
3099 | } |
3100 | |
3101 | int MacroAssembler::load_unsigned_byte(Register dst, Address src) { |
  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
  // and "3.9 Partial Register Penalties" (p. 22).
3104 | int off; |
3105 | if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) { |
3106 | off = offset(); |
3107 | movzbl(dst, src); // movzxb |
3108 | } else { |
3109 | xorl(dst, dst); |
3110 | off = offset(); |
3111 | movb(dst, src); |
3112 | } |
3113 | return off; |
3114 | } |
3115 | |
3116 | // Note: load_unsigned_short used to be called load_unsigned_word. |
3117 | int MacroAssembler::load_unsigned_short(Register dst, Address src) { |
  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
  // and "3.9 Partial Register Penalties" (p. 22).
3120 | int off; |
3121 | if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { |
3122 | off = offset(); |
3123 | movzwl(dst, src); // movzxw |
3124 | } else { |
3125 | xorl(dst, dst); |
3126 | off = offset(); |
3127 | movw(dst, src); |
3128 | } |
3129 | return off; |
3130 | } |
3131 | |
3132 | void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { |
3133 | switch (size_in_bytes) { |
3134 | #ifndef _LP64 |
3135 | case 8: |
3136 | assert(dst2 != noreg, "second dest register required" ); |
3137 | movl(dst, src); |
3138 | movl(dst2, src.plus_disp(BytesPerInt)); |
3139 | break; |
3140 | #else |
3141 | case 8: movq(dst, src); break; |
3142 | #endif |
3143 | case 4: movl(dst, src); break; |
3144 | case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; |
3145 | case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; |
3146 | default: ShouldNotReachHere(); |
3147 | } |
3148 | } |
3149 | |
3150 | void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { |
3151 | switch (size_in_bytes) { |
3152 | #ifndef _LP64 |
3153 | case 8: |
3154 | assert(src2 != noreg, "second source register required" ); |
3155 | movl(dst, src); |
3156 | movl(dst.plus_disp(BytesPerInt), src2); |
3157 | break; |
3158 | #else |
3159 | case 8: movq(dst, src); break; |
3160 | #endif |
3161 | case 4: movl(dst, src); break; |
3162 | case 2: movw(dst, src); break; |
3163 | case 1: movb(dst, src); break; |
3164 | default: ShouldNotReachHere(); |
3165 | } |
3166 | } |
3167 | |
3168 | void MacroAssembler::mov32(AddressLiteral dst, Register src) { |
3169 | if (reachable(dst)) { |
3170 | movl(as_Address(dst), src); |
3171 | } else { |
3172 | lea(rscratch1, dst); |
3173 | movl(Address(rscratch1, 0), src); |
3174 | } |
3175 | } |
3176 | |
3177 | void MacroAssembler::mov32(Register dst, AddressLiteral src) { |
3178 | if (reachable(src)) { |
3179 | movl(dst, as_Address(src)); |
3180 | } else { |
3181 | lea(rscratch1, src); |
3182 | movl(dst, Address(rscratch1, 0)); |
3183 | } |
3184 | } |
3185 | |
3186 | // C++ bool manipulation |
3187 | |
3188 | void MacroAssembler::movbool(Register dst, Address src) { |
3189 | if(sizeof(bool) == 1) |
3190 | movb(dst, src); |
3191 | else if(sizeof(bool) == 2) |
3192 | movw(dst, src); |
3193 | else if(sizeof(bool) == 4) |
3194 | movl(dst, src); |
3195 | else |
3196 | // unsupported |
3197 | ShouldNotReachHere(); |
3198 | } |
3199 | |
3200 | void MacroAssembler::movbool(Address dst, bool boolconst) { |
3201 | if(sizeof(bool) == 1) |
3202 | movb(dst, (int) boolconst); |
3203 | else if(sizeof(bool) == 2) |
3204 | movw(dst, (int) boolconst); |
3205 | else if(sizeof(bool) == 4) |
3206 | movl(dst, (int) boolconst); |
3207 | else |
3208 | // unsupported |
3209 | ShouldNotReachHere(); |
3210 | } |
3211 | |
3212 | void MacroAssembler::movbool(Address dst, Register src) { |
3213 | if(sizeof(bool) == 1) |
3214 | movb(dst, src); |
3215 | else if(sizeof(bool) == 2) |
3216 | movw(dst, src); |
3217 | else if(sizeof(bool) == 4) |
3218 | movl(dst, src); |
3219 | else |
3220 | // unsupported |
3221 | ShouldNotReachHere(); |
3222 | } |
3223 | |
3224 | void MacroAssembler::movbyte(ArrayAddress dst, int src) { |
3225 | movb(as_Address(dst), src); |
3226 | } |
3227 | |
3228 | void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) { |
3229 | if (reachable(src)) { |
3230 | movdl(dst, as_Address(src)); |
3231 | } else { |
3232 | lea(rscratch1, src); |
3233 | movdl(dst, Address(rscratch1, 0)); |
3234 | } |
3235 | } |
3236 | |
3237 | void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) { |
3238 | if (reachable(src)) { |
3239 | movq(dst, as_Address(src)); |
3240 | } else { |
3241 | lea(rscratch1, src); |
3242 | movq(dst, Address(rscratch1, 0)); |
3243 | } |
3244 | } |
3245 | |
3246 | #ifdef COMPILER2 |
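// Loads k1 with a mask of the low 'src' bits ((1 << src) - 1) for use by
// post-loop multiversioning, and copies the count in src back into dst.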
3247 | void MacroAssembler::setvectmask(Register dst, Register src) { |
3248 | guarantee(PostLoopMultiversioning, "must be" ); |
3249 | Assembler::movl(dst, 1); |
3250 | Assembler::shlxl(dst, dst, src); |
3251 | Assembler::decl(dst); |
3252 | Assembler::kmovdl(k1, dst); |
3253 | Assembler::movl(dst, src); |
3254 | } |
3255 | |
3256 | void MacroAssembler::restorevectmask() { |
3257 | guarantee(PostLoopMultiversioning, "must be" ); |
3258 | Assembler::knotwl(k1, k0); |
3259 | } |
3260 | #endif // COMPILER2 |
3261 | |
3262 | void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) { |
3263 | if (reachable(src)) { |
3264 | if (UseXmmLoadAndClearUpper) { |
3265 | movsd (dst, as_Address(src)); |
3266 | } else { |
3267 | movlpd(dst, as_Address(src)); |
3268 | } |
3269 | } else { |
3270 | lea(rscratch1, src); |
3271 | if (UseXmmLoadAndClearUpper) { |
3272 | movsd (dst, Address(rscratch1, 0)); |
3273 | } else { |
3274 | movlpd(dst, Address(rscratch1, 0)); |
3275 | } |
3276 | } |
3277 | } |
3278 | |
3279 | void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) { |
3280 | if (reachable(src)) { |
3281 | movss(dst, as_Address(src)); |
3282 | } else { |
3283 | lea(rscratch1, src); |
3284 | movss(dst, Address(rscratch1, 0)); |
3285 | } |
3286 | } |
3287 | |
3288 | void MacroAssembler::movptr(Register dst, Register src) { |
3289 | LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); |
3290 | } |
3291 | |
3292 | void MacroAssembler::movptr(Register dst, Address src) { |
3293 | LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); |
3294 | } |
3295 | |
3296 | // src should NEVER be a real pointer. Use AddressLiteral for true pointers |
3297 | void MacroAssembler::movptr(Register dst, intptr_t src) { |
3298 | LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src)); |
3299 | } |
3300 | |
3301 | void MacroAssembler::movptr(Address dst, Register src) { |
3302 | LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); |
3303 | } |
3304 | |
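// The movdqu/vmovdqu wrappers below assert that XMM registers 16-31 are only
// used when AVX512VL is available, since the legacy (non-EVEX) encodings can
// only address registers 0-15.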
3305 | void MacroAssembler::movdqu(Address dst, XMMRegister src) { |
3306 | assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3307 | Assembler::movdqu(dst, src); |
3308 | } |
3309 | |
3310 | void MacroAssembler::movdqu(XMMRegister dst, Address src) { |
3311 | assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3312 | Assembler::movdqu(dst, src); |
3313 | } |
3314 | |
3315 | void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { |
3316 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3317 | Assembler::movdqu(dst, src); |
3318 | } |
3319 | |
3320 | void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) { |
3321 | if (reachable(src)) { |
3322 | movdqu(dst, as_Address(src)); |
3323 | } else { |
3324 | lea(scratchReg, src); |
3325 | movdqu(dst, Address(scratchReg, 0)); |
3326 | } |
3327 | } |
3328 | |
3329 | void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { |
3330 | assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3331 | Assembler::vmovdqu(dst, src); |
3332 | } |
3333 | |
3334 | void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { |
3335 | assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3336 | Assembler::vmovdqu(dst, src); |
3337 | } |
3338 | |
3339 | void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { |
3340 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3341 | Assembler::vmovdqu(dst, src); |
3342 | } |
3343 | |
3344 | void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) { |
3345 | if (reachable(src)) { |
3346 | vmovdqu(dst, as_Address(src)); |
3347 | } |
3348 | else { |
3349 | lea(scratch_reg, src); |
3350 | vmovdqu(dst, Address(scratch_reg, 0)); |
3351 | } |
3352 | } |
3353 | |
3354 | void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { |
3355 | if (reachable(src)) { |
3356 | Assembler::evmovdquq(dst, as_Address(src), vector_len); |
3357 | } else { |
3358 | lea(rscratch, src); |
3359 | Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); |
3360 | } |
3361 | } |
3362 | |
3363 | void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) { |
3364 | if (reachable(src)) { |
3365 | Assembler::movdqa(dst, as_Address(src)); |
3366 | } else { |
3367 | lea(rscratch1, src); |
3368 | Assembler::movdqa(dst, Address(rscratch1, 0)); |
3369 | } |
3370 | } |
3371 | |
3372 | void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { |
3373 | if (reachable(src)) { |
3374 | Assembler::movsd(dst, as_Address(src)); |
3375 | } else { |
3376 | lea(rscratch1, src); |
3377 | Assembler::movsd(dst, Address(rscratch1, 0)); |
3378 | } |
3379 | } |
3380 | |
3381 | void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) { |
3382 | if (reachable(src)) { |
3383 | Assembler::movss(dst, as_Address(src)); |
3384 | } else { |
3385 | lea(rscratch1, src); |
3386 | Assembler::movss(dst, Address(rscratch1, 0)); |
3387 | } |
3388 | } |
3389 | |
3390 | void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) { |
3391 | if (reachable(src)) { |
3392 | Assembler::mulsd(dst, as_Address(src)); |
3393 | } else { |
3394 | lea(rscratch1, src); |
3395 | Assembler::mulsd(dst, Address(rscratch1, 0)); |
3396 | } |
3397 | } |
3398 | |
3399 | void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) { |
3400 | if (reachable(src)) { |
3401 | Assembler::mulss(dst, as_Address(src)); |
3402 | } else { |
3403 | lea(rscratch1, src); |
3404 | Assembler::mulss(dst, Address(rscratch1, 0)); |
3405 | } |
3406 | } |
3407 | |
3408 | void MacroAssembler::null_check(Register reg, int offset) { |
3409 | if (needs_explicit_null_check(offset)) { |
3410 | // provoke OS NULL exception if reg = NULL by |
3411 | // accessing M[reg] w/o changing any (non-CC) registers |
3412 | // NOTE: cmpl is plenty here to provoke a segv |
3413 | cmpptr(rax, Address(reg, 0)); |
3414 | // Note: should probably use testl(rax, Address(reg, 0)); |
3415 | // may be shorter code (however, this version of |
3416 | // testl needs to be implemented first) |
3417 | } else { |
3418 | // nothing to do, (later) access of M[reg + offset] |
3419 | // will provoke OS NULL exception if reg = NULL |
3420 | } |
3421 | } |
3422 | |
3423 | void MacroAssembler::os_breakpoint() { |
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3425 | // (e.g., MSVC can't call ps() otherwise) |
3426 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); |
3427 | } |
3428 | |
3429 | void MacroAssembler::unimplemented(const char* what) { |
3430 | const char* buf = NULL; |
3431 | { |
3432 | ResourceMark rm; |
3433 | stringStream ss; |
3434 | ss.print("unimplemented: %s" , what); |
3435 | buf = code_string(ss.as_string()); |
3436 | } |
3437 | stop(buf); |
3438 | } |
3439 | |
3440 | #ifdef _LP64 |
3441 | #define XSTATE_BV 0x200 |
3442 | #endif |
3443 | |
3444 | void MacroAssembler::pop_CPU_state() { |
3445 | pop_FPU_state(); |
3446 | pop_IU_state(); |
3447 | } |
3448 | |
3449 | void MacroAssembler::pop_FPU_state() { |
3450 | #ifndef _LP64 |
3451 | frstor(Address(rsp, 0)); |
3452 | #else |
3453 | fxrstor(Address(rsp, 0)); |
3454 | #endif |
3455 | addptr(rsp, FPUStateSizeInWords * wordSize); |
3456 | } |
3457 | |
3458 | void MacroAssembler::pop_IU_state() { |
3459 | popa(); |
3460 | LP64_ONLY(addq(rsp, 8)); |
3461 | popf(); |
3462 | } |
3463 | |
3464 | // Save Integer and Float state |
3465 | // Warning: Stack must be 16 byte aligned (64bit) |
3466 | void MacroAssembler::push_CPU_state() { |
3467 | push_IU_state(); |
3468 | push_FPU_state(); |
3469 | } |
3470 | |
3471 | void MacroAssembler::push_FPU_state() { |
3472 | subptr(rsp, FPUStateSizeInWords * wordSize); |
3473 | #ifndef _LP64 |
3474 | fnsave(Address(rsp, 0)); |
3475 | fwait(); |
3476 | #else |
3477 | fxsave(Address(rsp, 0)); |
3478 | #endif // LP64 |
3479 | } |
3480 | |
3481 | void MacroAssembler::push_IU_state() { |
3482 | // Push flags first because pusha kills them |
3483 | pushf(); |
3484 | // Make sure rsp stays 16-byte aligned |
3485 | LP64_ONLY(subq(rsp, 8)); |
3486 | pusha(); |
3487 | } |
3488 | |
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
3490 | if (!java_thread->is_valid()) { |
3491 | java_thread = rdi; |
3492 | get_thread(java_thread); |
3493 | } |
3494 | // we must set sp to zero to clear frame |
3495 | movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); |
3496 | if (clear_fp) { |
3497 | movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); |
3498 | } |
3499 | |
3500 | // Always clear the pc because it could have been set by make_walkable() |
3501 | movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); |
3502 | |
3503 | vzeroupper(); |
3504 | } |
3505 | |
3506 | void MacroAssembler::restore_rax(Register tmp) { |
3507 | if (tmp == noreg) pop(rax); |
3508 | else if (tmp != rax) mov(rax, tmp); |
3509 | } |
3510 | |
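// Round reg up to the next multiple of modulus; the add/and sequence is only
// exact when modulus is a power of two.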
3511 | void MacroAssembler::round_to(Register reg, int modulus) { |
3512 | addptr(reg, modulus - 1); |
3513 | andptr(reg, -modulus); |
3514 | } |
3515 | |
3516 | void MacroAssembler::save_rax(Register tmp) { |
3517 | if (tmp == noreg) push(rax); |
3518 | else if (tmp != rax) mov(tmp, rax); |
3519 | } |
3520 | |
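// Safepoint check: with thread-local polling, test the poll bit stored at the
// thread's polling page offset; otherwise compare the global safepoint state
// against _not_synchronized.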
3521 | void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) { |
3522 | if (SafepointMechanism::uses_thread_local_poll()) { |
3523 | #ifdef _LP64 |
3524 | assert(thread_reg == r15_thread, "should be" ); |
3525 | #else |
3526 | if (thread_reg == noreg) { |
3527 | thread_reg = temp_reg; |
3528 | get_thread(thread_reg); |
3529 | } |
3530 | #endif |
3531 | testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit()); |
3532 | jcc(Assembler::notZero, slow_path); // handshake bit set implies poll |
3533 | } else { |
3534 | cmp32(ExternalAddress(SafepointSynchronize::address_of_state()), |
3535 | SafepointSynchronize::_not_synchronized); |
3536 | jcc(Assembler::notEqual, slow_path); |
3537 | } |
3538 | } |
3539 | |
3540 | // Calls to C land |
3541 | // |
3542 | // When entering C land, the rbp, & rsp of the last Java frame have to be recorded |
3543 | // in the (thread-local) JavaThread object. When leaving C land, the last Java fp |
3544 | // has to be reset to 0. This is required to allow proper stack traversal. |
3545 | void MacroAssembler::set_last_Java_frame(Register java_thread, |
3546 | Register last_java_sp, |
3547 | Register last_java_fp, |
3548 | address last_java_pc) { |
3549 | vzeroupper(); |
3550 | // determine java_thread register |
3551 | if (!java_thread->is_valid()) { |
3552 | java_thread = rdi; |
3553 | get_thread(java_thread); |
3554 | } |
3555 | // determine last_java_sp register |
3556 | if (!last_java_sp->is_valid()) { |
3557 | last_java_sp = rsp; |
3558 | } |
3559 | |
3560 | // last_java_fp is optional |
3561 | |
3562 | if (last_java_fp->is_valid()) { |
3563 | movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp); |
3564 | } |
3565 | |
3566 | // last_java_pc is optional |
3567 | |
3568 | if (last_java_pc != NULL) { |
3569 | lea(Address(java_thread, |
3570 | JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()), |
3571 | InternalAddress(last_java_pc)); |
3572 | |
3573 | } |
3574 | movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp); |
3575 | } |
3576 | |
3577 | void MacroAssembler::shlptr(Register dst, int imm8) { |
3578 | LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); |
3579 | } |
3580 | |
3581 | void MacroAssembler::shrptr(Register dst, int imm8) { |
3582 | LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); |
3583 | } |
3584 | |
3585 | void MacroAssembler::sign_extend_byte(Register reg) { |
3586 | if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { |
3587 | movsbl(reg, reg); // movsxb |
3588 | } else { |
3589 | shll(reg, 24); |
3590 | sarl(reg, 24); |
3591 | } |
3592 | } |
3593 | |
3594 | void MacroAssembler::sign_extend_short(Register reg) { |
3595 | if (LP64_ONLY(true ||) VM_Version::is_P6()) { |
3596 | movswl(reg, reg); // movsxw |
3597 | } else { |
3598 | shll(reg, 16); |
3599 | sarl(reg, 16); |
3600 | } |
3601 | } |
3602 | |
3603 | void MacroAssembler::testl(Register dst, AddressLiteral src) { |
3604 | assert(reachable(src), "Address should be reachable" ); |
3605 | testl(dst, as_Address(src)); |
3606 | } |
3607 | |
3608 | void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { |
3609 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3610 | Assembler::pcmpeqb(dst, src); |
3611 | } |
3612 | |
3613 | void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { |
3614 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3615 | Assembler::pcmpeqw(dst, src); |
3616 | } |
3617 | |
3618 | void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { |
3619 | assert((dst->encoding() < 16),"XMM register should be 0-15" ); |
3620 | Assembler::pcmpestri(dst, src, imm8); |
3621 | } |
3622 | |
3623 | void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { |
3624 | assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15" ); |
3625 | Assembler::pcmpestri(dst, src, imm8); |
3626 | } |
3627 | |
3628 | void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { |
3629 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3630 | Assembler::pmovzxbw(dst, src); |
3631 | } |
3632 | |
3633 | void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { |
3634 | assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3635 | Assembler::pmovzxbw(dst, src); |
3636 | } |
3637 | |
3638 | void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { |
3639 | assert((src->encoding() < 16),"XMM register should be 0-15" ); |
3640 | Assembler::pmovmskb(dst, src); |
3641 | } |
3642 | |
3643 | void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { |
3644 | assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15" ); |
3645 | Assembler::ptest(dst, src); |
3646 | } |
3647 | |
3648 | void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) { |
3649 | if (reachable(src)) { |
3650 | Assembler::sqrtsd(dst, as_Address(src)); |
3651 | } else { |
3652 | lea(rscratch1, src); |
3653 | Assembler::sqrtsd(dst, Address(rscratch1, 0)); |
3654 | } |
3655 | } |
3656 | |
3657 | void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) { |
3658 | if (reachable(src)) { |
3659 | Assembler::sqrtss(dst, as_Address(src)); |
3660 | } else { |
3661 | lea(rscratch1, src); |
3662 | Assembler::sqrtss(dst, Address(rscratch1, 0)); |
3663 | } |
3664 | } |
3665 | |
3666 | void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) { |
3667 | if (reachable(src)) { |
3668 | Assembler::subsd(dst, as_Address(src)); |
3669 | } else { |
3670 | lea(rscratch1, src); |
3671 | Assembler::subsd(dst, Address(rscratch1, 0)); |
3672 | } |
3673 | } |
3674 | |
3675 | void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) { |
3676 | if (reachable(src)) { |
3677 | Assembler::subss(dst, as_Address(src)); |
3678 | } else { |
3679 | lea(rscratch1, src); |
3680 | Assembler::subss(dst, Address(rscratch1, 0)); |
3681 | } |
3682 | } |
3683 | |
3684 | void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) { |
3685 | if (reachable(src)) { |
3686 | Assembler::ucomisd(dst, as_Address(src)); |
3687 | } else { |
3688 | lea(rscratch1, src); |
3689 | Assembler::ucomisd(dst, Address(rscratch1, 0)); |
3690 | } |
3691 | } |
3692 | |
3693 | void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) { |
3694 | if (reachable(src)) { |
3695 | Assembler::ucomiss(dst, as_Address(src)); |
3696 | } else { |
3697 | lea(rscratch1, src); |
3698 | Assembler::ucomiss(dst, Address(rscratch1, 0)); |
3699 | } |
3700 | } |
3701 | |
3702 | void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { |
3703 | // Used in sign-bit flipping with aligned address. |
3704 | assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes" ); |
3705 | if (reachable(src)) { |
3706 | Assembler::xorpd(dst, as_Address(src)); |
3707 | } else { |
3708 | lea(scratch_reg, src); |
3709 | Assembler::xorpd(dst, Address(scratch_reg, 0)); |
3710 | } |
3711 | } |
3712 | |
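// Note: the EVEX encodings of xorpd/xorps require AVX512DQ, so on AVX-512
// hardware without DQ the self-xor (dst == src) zeroing idiom is emitted as
// vpxor, which only needs AVX512F.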
3713 | void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { |
3714 | if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { |
3715 | Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); |
3716 | } |
3717 | else { |
3718 | Assembler::xorpd(dst, src); |
3719 | } |
3720 | } |
3721 | |
3722 | void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { |
3723 | if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { |
3724 | Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); |
3725 | } else { |
3726 | Assembler::xorps(dst, src); |
3727 | } |
3728 | } |
3729 | |
3730 | void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { |
3731 | // Used in sign-bit flipping with aligned address. |
3732 | assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes" ); |
3733 | if (reachable(src)) { |
3734 | Assembler::xorps(dst, as_Address(src)); |
3735 | } else { |
3736 | lea(scratch_reg, src); |
3737 | Assembler::xorps(dst, Address(scratch_reg, 0)); |
3738 | } |
3739 | } |
3740 | |
3741 | void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { |
3742 | // Used in sign-bit flipping with aligned address. |
3743 | bool aligned_adr = (((intptr_t)src.target() & 15) == 0); |
3744 | assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes" ); |
3745 | if (reachable(src)) { |
3746 | Assembler::pshufb(dst, as_Address(src)); |
3747 | } else { |
3748 | lea(rscratch1, src); |
3749 | Assembler::pshufb(dst, Address(rscratch1, 0)); |
3750 | } |
3751 | } |
3752 | |
3753 | // AVX 3-operands instructions |
3754 | |
3755 | void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3756 | if (reachable(src)) { |
3757 | vaddsd(dst, nds, as_Address(src)); |
3758 | } else { |
3759 | lea(rscratch1, src); |
3760 | vaddsd(dst, nds, Address(rscratch1, 0)); |
3761 | } |
3762 | } |
3763 | |
3764 | void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3765 | if (reachable(src)) { |
3766 | vaddss(dst, nds, as_Address(src)); |
3767 | } else { |
3768 | lea(rscratch1, src); |
3769 | vaddss(dst, nds, Address(rscratch1, 0)); |
3770 | } |
3771 | } |
3772 | |
3773 | void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { |
3774 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15" ); |
3775 | vandps(dst, nds, negate_field, vector_len); |
3776 | } |
3777 | |
3778 | void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { |
3779 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15" ); |
3780 | vandpd(dst, nds, negate_field, vector_len); |
3781 | } |
3782 | |
3783 | void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3784 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3785 | Assembler::vpaddb(dst, nds, src, vector_len); |
3786 | } |
3787 | |
3788 | void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { |
3789 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3790 | Assembler::vpaddb(dst, nds, src, vector_len); |
3791 | } |
3792 | |
3793 | void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3794 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3795 | Assembler::vpaddw(dst, nds, src, vector_len); |
3796 | } |
3797 | |
3798 | void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { |
3799 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3800 | Assembler::vpaddw(dst, nds, src, vector_len); |
3801 | } |
3802 | |
3803 | void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
3804 | if (reachable(src)) { |
3805 | Assembler::vpand(dst, nds, as_Address(src), vector_len); |
3806 | } else { |
3807 | lea(scratch_reg, src); |
3808 | Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len); |
3809 | } |
3810 | } |
3811 | |
3812 | void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) { |
3813 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3814 | Assembler::vpbroadcastw(dst, src, vector_len); |
3815 | } |
3816 | |
3817 | void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3818 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3819 | Assembler::vpcmpeqb(dst, nds, src, vector_len); |
3820 | } |
3821 | |
3822 | void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3823 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3824 | Assembler::vpcmpeqw(dst, nds, src, vector_len); |
3825 | } |
3826 | |
3827 | void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { |
3828 | assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3829 | Assembler::vpmovzxbw(dst, src, vector_len); |
3830 | } |
3831 | |
3832 | void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) { |
3833 | assert((src->encoding() < 16),"XMM register should be 0-15" ); |
3834 | Assembler::vpmovmskb(dst, src); |
3835 | } |
3836 | |
3837 | void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3838 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3839 | Assembler::vpmullw(dst, nds, src, vector_len); |
3840 | } |
3841 | |
3842 | void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { |
3843 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3844 | Assembler::vpmullw(dst, nds, src, vector_len); |
3845 | } |
3846 | |
3847 | void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3848 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3849 | Assembler::vpsubb(dst, nds, src, vector_len); |
3850 | } |
3851 | |
3852 | void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { |
3853 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3854 | Assembler::vpsubb(dst, nds, src, vector_len); |
3855 | } |
3856 | |
3857 | void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
3858 | assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3859 | Assembler::vpsubw(dst, nds, src, vector_len); |
3860 | } |
3861 | |
3862 | void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { |
3863 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3864 | Assembler::vpsubw(dst, nds, src, vector_len); |
3865 | } |
3866 | |
3867 | void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { |
3868 | assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3869 | Assembler::vpsraw(dst, nds, shift, vector_len); |
3870 | } |
3871 | |
3872 | void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { |
3873 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3874 | Assembler::vpsraw(dst, nds, shift, vector_len); |
3875 | } |
3876 | |
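// evpsraq needs AVX512VL for 128/256-bit operands; without VL the operation
// is widened to the 512-bit form (vector_len = 2), which is always legal on
// AVX-512 hardware.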
3877 | void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { |
3878 | assert(UseAVX > 2,"" ); |
3879 | if (!VM_Version::supports_avx512vl() && vector_len < 2) { |
3880 | vector_len = 2; |
3881 | } |
3882 | Assembler::evpsraq(dst, nds, shift, vector_len); |
3883 | } |
3884 | |
3885 | void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { |
3886 | assert(UseAVX > 2,"" ); |
3887 | if (!VM_Version::supports_avx512vl() && vector_len < 2) { |
3888 | vector_len = 2; |
3889 | } |
3890 | Assembler::evpsraq(dst, nds, shift, vector_len); |
3891 | } |
3892 | |
3893 | void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { |
3894 | assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3895 | Assembler::vpsrlw(dst, nds, shift, vector_len); |
3896 | } |
3897 | |
3898 | void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { |
3899 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3900 | Assembler::vpsrlw(dst, nds, shift, vector_len); |
3901 | } |
3902 | |
3903 | void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { |
3904 | assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3905 | Assembler::vpsllw(dst, nds, shift, vector_len); |
3906 | } |
3907 | |
3908 | void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { |
3909 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3910 | Assembler::vpsllw(dst, nds, shift, vector_len); |
3911 | } |
3912 | |
3913 | void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { |
3914 | assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15" ); |
3915 | Assembler::vptest(dst, src); |
3916 | } |
3917 | |
3918 | void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { |
3919 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3920 | Assembler::punpcklbw(dst, src); |
3921 | } |
3922 | |
3923 | void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { |
3924 | assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15" ); |
3925 | Assembler::pshufd(dst, src, mode); |
3926 | } |
3927 | |
3928 | void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { |
3929 | assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15" ); |
3930 | Assembler::pshuflw(dst, src, mode); |
3931 | } |
3932 | |
3933 | void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
3934 | if (reachable(src)) { |
3935 | vandpd(dst, nds, as_Address(src), vector_len); |
3936 | } else { |
3937 | lea(scratch_reg, src); |
3938 | vandpd(dst, nds, Address(scratch_reg, 0), vector_len); |
3939 | } |
3940 | } |
3941 | |
3942 | void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
3943 | if (reachable(src)) { |
3944 | vandps(dst, nds, as_Address(src), vector_len); |
3945 | } else { |
3946 | lea(scratch_reg, src); |
3947 | vandps(dst, nds, Address(scratch_reg, 0), vector_len); |
3948 | } |
3949 | } |
3950 | |
3951 | void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3952 | if (reachable(src)) { |
3953 | vdivsd(dst, nds, as_Address(src)); |
3954 | } else { |
3955 | lea(rscratch1, src); |
3956 | vdivsd(dst, nds, Address(rscratch1, 0)); |
3957 | } |
3958 | } |
3959 | |
3960 | void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3961 | if (reachable(src)) { |
3962 | vdivss(dst, nds, as_Address(src)); |
3963 | } else { |
3964 | lea(rscratch1, src); |
3965 | vdivss(dst, nds, Address(rscratch1, 0)); |
3966 | } |
3967 | } |
3968 | |
3969 | void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3970 | if (reachable(src)) { |
3971 | vmulsd(dst, nds, as_Address(src)); |
3972 | } else { |
3973 | lea(rscratch1, src); |
3974 | vmulsd(dst, nds, Address(rscratch1, 0)); |
3975 | } |
3976 | } |
3977 | |
3978 | void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3979 | if (reachable(src)) { |
3980 | vmulss(dst, nds, as_Address(src)); |
3981 | } else { |
3982 | lea(rscratch1, src); |
3983 | vmulss(dst, nds, Address(rscratch1, 0)); |
3984 | } |
3985 | } |
3986 | |
3987 | void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3988 | if (reachable(src)) { |
3989 | vsubsd(dst, nds, as_Address(src)); |
3990 | } else { |
3991 | lea(rscratch1, src); |
3992 | vsubsd(dst, nds, Address(rscratch1, 0)); |
3993 | } |
3994 | } |
3995 | |
3996 | void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
3997 | if (reachable(src)) { |
3998 | vsubss(dst, nds, as_Address(src)); |
3999 | } else { |
4000 | lea(rscratch1, src); |
4001 | vsubss(dst, nds, Address(rscratch1, 0)); |
4002 | } |
4003 | } |
4004 | |
4005 | void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
4006 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15" ); |
4007 | vxorps(dst, nds, src, Assembler::AVX_128bit); |
4008 | } |
4009 | |
4010 | void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { |
4011 | assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15" ); |
4012 | vxorpd(dst, nds, src, Assembler::AVX_128bit); |
4013 | } |
4014 | |
4015 | void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
4016 | if (reachable(src)) { |
4017 | vxorpd(dst, nds, as_Address(src), vector_len); |
4018 | } else { |
4019 | lea(scratch_reg, src); |
4020 | vxorpd(dst, nds, Address(scratch_reg, 0), vector_len); |
4021 | } |
4022 | } |
4023 | |
4024 | void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
4025 | if (reachable(src)) { |
4026 | vxorps(dst, nds, as_Address(src), vector_len); |
4027 | } else { |
4028 | lea(scratch_reg, src); |
4029 | vxorps(dst, nds, Address(scratch_reg, 0), vector_len); |
4030 | } |
4031 | } |
4032 | |
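// A 256-bit integer vpxor requires AVX2, so with plain AVX the 256-bit case
// falls back to vxorpd on the floating-point side; 128-bit vectors
// (vector_len < 1) always use vpxor.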
4033 | void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { |
4034 | if (UseAVX > 1 || (vector_len < 1)) { |
4035 | if (reachable(src)) { |
4036 | Assembler::vpxor(dst, nds, as_Address(src), vector_len); |
4037 | } else { |
4038 | lea(scratch_reg, src); |
4039 | Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len); |
4040 | } |
4041 | } |
4042 | else { |
4043 | MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg); |
4044 | } |
4045 | } |
4046 | |
4047 | //------------------------------------------------------------------------------------------- |
4048 | #ifdef COMPILER2 |
4049 | // Generic instructions support for use in .ad files C2 code generation |
4050 | |
4051 | void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) { |
4052 | if (opcode == Op_AbsVD) { |
4053 | andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr); |
4054 | } else { |
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
4056 | xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr); |
4057 | } |
4058 | } |
4059 | |
4060 | void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { |
4061 | if (opcode == Op_AbsVD) { |
4062 | vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr); |
4063 | } else { |
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
4065 | vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr); |
4066 | } |
4067 | } |
4068 | |
4069 | void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) { |
4070 | if (opcode == Op_AbsVF) { |
4071 | andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr); |
4072 | } else { |
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
4074 | xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr); |
4075 | } |
4076 | } |
4077 | |
4078 | void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { |
4079 | if (opcode == Op_AbsVF) { |
4080 | vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr); |
4081 | } else { |
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
4083 | vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr); |
4084 | } |
4085 | } |
4086 | |
4087 | void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { |
4088 | if (sign) { |
4089 | pmovsxbw(dst, src); |
4090 | } else { |
4091 | pmovzxbw(dst, src); |
4092 | } |
4093 | } |
4094 | |
4095 | void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { |
4096 | if (sign) { |
4097 | vpmovsxbw(dst, src, vector_len); |
4098 | } else { |
4099 | vpmovzxbw(dst, src, vector_len); |
4100 | } |
4101 | } |
4102 | |
4103 | void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) { |
4104 | if (opcode == Op_RShiftVI) { |
4105 | psrad(dst, src); |
4106 | } else if (opcode == Op_LShiftVI) { |
4107 | pslld(dst, src); |
4108 | } else { |
4109 | assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI" ); |
4110 | psrld(dst, src); |
4111 | } |
4112 | } |
4113 | |
4114 | void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
4115 | if (opcode == Op_RShiftVI) { |
4116 | vpsrad(dst, nds, src, vector_len); |
4117 | } else if (opcode == Op_LShiftVI) { |
4118 | vpslld(dst, nds, src, vector_len); |
4119 | } else { |
4120 | assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI" ); |
4121 | vpsrld(dst, nds, src, vector_len); |
4122 | } |
4123 | } |
4124 | |
4125 | void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) { |
4126 | if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { |
4127 | psraw(dst, src); |
4128 | } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { |
4129 | psllw(dst, src); |
4130 | } else { |
4131 | assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB" ); |
4132 | psrlw(dst, src); |
4133 | } |
4134 | } |
4135 | |
4136 | void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
4137 | if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { |
4138 | vpsraw(dst, nds, src, vector_len); |
4139 | } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { |
4140 | vpsllw(dst, nds, src, vector_len); |
4141 | } else { |
4142 | assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB" ); |
4143 | vpsrlw(dst, nds, src, vector_len); |
4144 | } |
4145 | } |
4146 | |
4147 | void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) { |
4148 | if (opcode == Op_RShiftVL) { |
    psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
4150 | } else if (opcode == Op_LShiftVL) { |
4151 | psllq(dst, src); |
4152 | } else { |
4153 | assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL" ); |
4154 | psrlq(dst, src); |
4155 | } |
4156 | } |
4157 | |
4158 | void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { |
4159 | if (opcode == Op_RShiftVL) { |
4160 | evpsraq(dst, nds, src, vector_len); |
4161 | } else if (opcode == Op_LShiftVL) { |
4162 | vpsllq(dst, nds, src, vector_len); |
4163 | } else { |
4164 | assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL" ); |
4165 | vpsrlq(dst, nds, src, vector_len); |
4166 | } |
4167 | } |
4168 | #endif |
4169 | //------------------------------------------------------------------------------------------- |
4170 | |
4171 | void MacroAssembler::clear_jweak_tag(Register possibly_jweak) { |
4172 | const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask); |
4173 | STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code |
4174 | // The inverted mask is sign-extended |
4175 | andptr(possibly_jweak, inverted_jweak_mask); |
4176 | } |
4177 | |
4178 | void MacroAssembler::resolve_jobject(Register value, |
4179 | Register thread, |
4180 | Register tmp) { |
4181 | assert_different_registers(value, thread, tmp); |
4182 | Label done, not_weak; |
4183 | testptr(value, value); |
4184 | jcc(Assembler::zero, done); // Use NULL as-is. |
4185 | testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag. |
4186 | jcc(Assembler::zero, not_weak); |
4187 | // Resolve jweak. |
4188 | access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, |
4189 | value, Address(value, -JNIHandles::weak_tag_value), tmp, thread); |
4190 | verify_oop(value); |
4191 | jmp(done); |
4192 | bind(not_weak); |
4193 | // Resolve (untagged) jobject. |
4194 | access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); |
4195 | verify_oop(value); |
4196 | bind(done); |
4197 | } |
4198 | |
4199 | void MacroAssembler::subptr(Register dst, int32_t imm32) { |
4200 | LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); |
4201 | } |
4202 | |
4203 | // Force generation of a 4 byte immediate value even if it fits into 8bit |
4204 | void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { |
4205 | LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); |
4206 | } |
4207 | |
4208 | void MacroAssembler::subptr(Register dst, Register src) { |
4209 | LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); |
4210 | } |
4211 | |
4212 | // C++ bool manipulation |
4213 | void MacroAssembler::testbool(Register dst) { |
4214 | if(sizeof(bool) == 1) |
4215 | testb(dst, 0xff); |
4216 | else if(sizeof(bool) == 2) { |
4217 | // testw implementation needed for two byte bools |
4218 | ShouldNotReachHere(); |
4219 | } else if(sizeof(bool) == 4) |
4220 | testl(dst, dst); |
4221 | else |
4222 | // unsupported |
4223 | ShouldNotReachHere(); |
4224 | } |
4225 | |
4226 | void MacroAssembler::testptr(Register dst, Register src) { |
4227 | LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); |
4228 | } |
4229 | |
4230 | // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. |
4231 | void MacroAssembler::tlab_allocate(Register thread, Register obj, |
4232 | Register var_size_in_bytes, |
4233 | int con_size_in_bytes, |
4234 | Register t1, |
4235 | Register t2, |
4236 | Label& slow_case) { |
4237 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
4238 | bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); |
4239 | } |
4240 | |
4241 | // Defines obj, preserves var_size_in_bytes |
4242 | void MacroAssembler::eden_allocate(Register thread, Register obj, |
4243 | Register var_size_in_bytes, |
4244 | int con_size_in_bytes, |
4245 | Register t1, |
4246 | Label& slow_case) { |
4247 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
4248 | bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); |
4249 | } |
4250 | |
// Preserves the contents of address, destroys the contents of length_in_bytes and temp.
4252 | void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { |
4253 | assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different" ); |
4254 | assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord" ); |
4255 | Label done; |
4256 | |
4257 | testptr(length_in_bytes, length_in_bytes); |
4258 | jcc(Assembler::zero, done); |
4259 | |
4260 | // initialize topmost word, divide index by 2, check if odd and test if zero |
4261 | // note: for the remaining code to work, index must be a multiple of BytesPerWord |
4262 | #ifdef ASSERT |
4263 | { |
4264 | Label L; |
4265 | testptr(length_in_bytes, BytesPerWord - 1); |
4266 | jcc(Assembler::zero, L); |
4267 | stop("length must be a multiple of BytesPerWord" ); |
4268 | bind(L); |
4269 | } |
4270 | #endif |
4271 | Register index = length_in_bytes; |
4272 | xorptr(temp, temp); // use _zero reg to clear memory (shorter code) |
4273 | if (UseIncDec) { |
4274 | shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set |
4275 | } else { |
4276 | shrptr(index, 2); // use 2 instructions to avoid partial flag stall |
4277 | shrptr(index, 1); |
4278 | } |
4279 | #ifndef _LP64 |
4280 | // index could have not been a multiple of 8 (i.e., bit 2 was set) |
4281 | { |
4282 | Label even; |
4283 | // note: if index was a multiple of 8, then it cannot |
4284 | // be 0 now otherwise it must have been 0 before |
4285 | // => if it is even, we don't need to check for 0 again |
4286 | jcc(Assembler::carryClear, even); |
4287 | // clear topmost word (no jump would be needed if conditional assignment worked here) |
4288 | movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); |
4289 | // index could be 0 now, must check again |
4290 | jcc(Assembler::zero, done); |
4291 | bind(even); |
4292 | } |
4293 | #endif // !_LP64 |
4294 | // initialize remaining object fields: index is a multiple of 2 now |
4295 | { |
4296 | Label loop; |
4297 | bind(loop); |
4298 | movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); |
4299 | NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) |
4300 | decrement(index); |
4301 | jcc(Assembler::notZero, loop); |
4302 | } |
4303 | |
4304 | bind(done); |
4305 | } |
4306 | |
4307 | // Look up the method for a megamorphic invokeinterface call. |
4308 | // The target method is determined by <intf_klass, itable_index>. |
4309 | // The receiver klass is in recv_klass. |
4310 | // On success, the result will be in method_result, and execution falls through. |
4311 | // On failure, execution transfers to the given label. |
4312 | void MacroAssembler::lookup_interface_method(Register recv_klass, |
4313 | Register intf_klass, |
4314 | RegisterOrConstant itable_index, |
4315 | Register method_result, |
4316 | Register scan_temp, |
4317 | Label& L_no_such_interface, |
4318 | bool return_method) { |
4319 | assert_different_registers(recv_klass, intf_klass, scan_temp); |
4320 | assert_different_registers(method_result, intf_klass, scan_temp); |
4321 | assert(recv_klass != method_result || !return_method, |
4322 | "recv_klass can be destroyed when method isn't needed" ); |
4323 | |
4324 | assert(itable_index.is_constant() || itable_index.as_register() == method_result, |
4325 | "caller must use same register for non-constant itable index as for method" ); |
4326 | |
4327 | // Compute start of first itableOffsetEntry (which is at the end of the vtable) |
4328 | int vtable_base = in_bytes(Klass::vtable_start_offset()); |
4329 | int itentry_off = itableMethodEntry::method_offset_in_bytes(); |
4330 | int scan_step = itableOffsetEntry::size() * wordSize; |
4331 | int vte_size = vtableEntry::size_in_bytes(); |
4332 | Address::ScaleFactor times_vte_scale = Address::times_ptr; |
4333 | assert(vte_size == wordSize, "else adjust times_vte_scale" ); |
4334 | |
4335 | movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); |
4336 | |
4337 | // %%% Could store the aligned, prescaled offset in the klassoop. |
4338 | lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); |
4339 | |
4340 | if (return_method) { |
4341 | // Adjust recv_klass by scaled itable_index, so we can free itable_index. |
4342 | assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below" ); |
4343 | lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); |
4344 | } |
4345 | |
4346 | // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { |
4347 | // if (scan->interface() == intf) { |
4348 | // result = (klass + scan->offset() + itable_index); |
4349 | // } |
4350 | // } |
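// The loop below is peeled once: the first copy (peel == 1) handles the common
// case where the very first itable entry already matches intf_klass, while the
// second copy forms the scan loop proper, with the test inverted so that a hit
// falls through to found_method.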
4351 | Label search, found_method; |
4352 | |
4353 | for (int peel = 1; peel >= 0; peel--) { |
4354 | movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); |
4355 | cmpptr(intf_klass, method_result); |
4356 | |
4357 | if (peel) { |
4358 | jccb(Assembler::equal, found_method); |
4359 | } else { |
4360 | jccb(Assembler::notEqual, search); |
4361 | // (invert the test to fall through to found_method...) |
4362 | } |
4363 | |
4364 | if (!peel) break; |
4365 | |
4366 | bind(search); |
4367 | |
4368 | // Check that the previous entry is non-null. A null entry means that |
4369 | // the receiver class doesn't implement the interface, and wasn't the |
4370 | // same as when the caller was compiled. |
4371 | testptr(method_result, method_result); |
4372 | jcc(Assembler::zero, L_no_such_interface); |
4373 | addptr(scan_temp, scan_step); |
4374 | } |
4375 | |
4376 | bind(found_method); |
4377 | |
4378 | if (return_method) { |
4379 | // Got a hit. |
4380 | movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); |
4381 | movptr(method_result, Address(recv_klass, scan_temp, Address::times_1)); |
4382 | } |
4383 | } |
4384 | |
4385 | |
4386 | // virtual method calling |
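// In effect (a sketch): method_result is loaded from the vtable entry at
// vtable_index, i.e. a single load from
//   recv_klass + Klass::vtable_start_offset()
//              + vtable_index * wordSize
//              + vtableEntry::method_offset_in_bytes()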
4387 | void MacroAssembler::lookup_virtual_method(Register recv_klass, |
4388 | RegisterOrConstant vtable_index, |
4389 | Register method_result) { |
4390 | const int base = in_bytes(Klass::vtable_start_offset()); |
4391 | assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below" ); |
4392 | Address vtable_entry_addr(recv_klass, |
4393 | vtable_index, Address::times_ptr, |
4394 | base + vtableEntry::method_offset_in_bytes()); |
4395 | movptr(method_result, vtable_entry_addr); |
4396 | } |
4397 | |
4398 | |
4399 | void MacroAssembler::check_klass_subtype(Register sub_klass, |
4400 | Register super_klass, |
4401 | Register temp_reg, |
4402 | Label& L_success) { |
4403 | Label L_failure; |
4404 | check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); |
4405 | check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); |
4406 | bind(L_failure); |
4407 | } |
4408 | |
4409 | |
4410 | void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, |
4411 | Register super_klass, |
4412 | Register temp_reg, |
4413 | Label* L_success, |
4414 | Label* L_failure, |
4415 | Label* L_slow_path, |
4416 | RegisterOrConstant super_check_offset) { |
4417 | assert_different_registers(sub_klass, super_klass, temp_reg); |
4418 | bool must_load_sco = (super_check_offset.constant_or_zero() == -1); |
4419 | if (super_check_offset.is_register()) { |
4420 | assert_different_registers(sub_klass, super_klass, |
4421 | super_check_offset.as_register()); |
4422 | } else if (must_load_sco) { |
4423 | assert(temp_reg != noreg, "supply either a temp or a register offset" ); |
4424 | } |
4425 | |
4426 | Label L_fallthrough; |
4427 | int label_nulls = 0; |
4428 | if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } |
4429 | if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } |
4430 | if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } |
4431 | assert(label_nulls <= 1, "at most one NULL in the batch" ); |
4432 | |
4433 | int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); |
4434 | int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
4435 | Address super_check_offset_addr(super_klass, sco_offset); |
4436 | |
4437 | // Hacked jcc, which "knows" that L_fallthrough, at least, is in |
4438 | // range of a jccb. If this routine grows larger, reconsider at |
4439 | // least some of these. |
4440 | #define local_jcc(assembler_cond, label) \ |
4441 | if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ |
4442 | else jcc( assembler_cond, label) /*omit semi*/ |
4443 | |
4444 | // Hacked jmp, which may only be used just before L_fallthrough. |
4445 | #define final_jmp(label) \ |
4446 | if (&(label) == &L_fallthrough) { /*do nothing*/ } \ |
4447 | else jmp(label) /*omit semi*/ |
4448 | |
4449 | // If the pointers are equal, we are done (e.g., String[] elements). |
4450 | // This self-check enables sharing of secondary supertype arrays among |
4451 | // non-primary types such as array-of-interface. Otherwise, each such |
4452 | // type would need its own customized SSA. |
4453 | // We move this check to the front of the fast path because many |
4454 | // type checks are in fact trivially successful in this manner, |
4455 | // so we get a nicely predicted branch right at the start of the check. |
4456 | cmpptr(sub_klass, super_klass); |
4457 | local_jcc(Assembler::equal, *L_success); |
4458 | |
4459 | // Check the supertype display: |
4460 | if (must_load_sco) { |
4461 | // Positive movl does right thing on LP64. |
4462 | movl(temp_reg, super_check_offset_addr); |
4463 | super_check_offset = RegisterOrConstant(temp_reg); |
4464 | } |
4465 | Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0); |
4466 | cmpptr(super_klass, super_check_addr); // load displayed supertype |
4467 | |
4468 | // This check has worked decisively for primary supers. |
4469 | // Secondary supers are sought in the super_cache ('super_cache_addr'). |
4470 | // (Secondary supers are interfaces and very deeply nested subtypes.) |
4471 | // This works in the same check above because of a tricky aliasing |
4472 | // between the super_cache and the primary super display elements. |
4473 | // (The 'super_check_addr' can address either, as the case requires.) |
4474 | // Note that the cache is updated below if it does not help us find |
4475 | // what we need immediately. |
4476 | // So if it was a primary super, we can just fail immediately. |
4477 | // Otherwise, it's the slow path for us (no success at this point). |
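// In pseudo-code, the whole fast path is approximately:
//   if (sub_klass == super_klass)                               goto L_success;
//   off = super_klass->super_check_offset;
//   if (*(Klass**)((address)sub_klass + off) == super_klass)    goto L_success;
//   if (off != in_bytes(Klass::secondary_super_cache_offset())) goto L_failure;
//   goto L_slow_path;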
4478 | |
4479 | if (super_check_offset.is_register()) { |
4480 | local_jcc(Assembler::equal, *L_success); |
4481 | cmpl(super_check_offset.as_register(), sc_offset); |
4482 | if (L_failure == &L_fallthrough) { |
4483 | local_jcc(Assembler::equal, *L_slow_path); |
4484 | } else { |
4485 | local_jcc(Assembler::notEqual, *L_failure); |
4486 | final_jmp(*L_slow_path); |
4487 | } |
4488 | } else if (super_check_offset.as_constant() == sc_offset) { |
4489 | // Need a slow path; fast failure is impossible. |
4490 | if (L_slow_path == &L_fallthrough) { |
4491 | local_jcc(Assembler::equal, *L_success); |
4492 | } else { |
4493 | local_jcc(Assembler::notEqual, *L_slow_path); |
4494 | final_jmp(*L_success); |
4495 | } |
4496 | } else { |
4497 | // No slow path; it's a fast decision. |
4498 | if (L_failure == &L_fallthrough) { |
4499 | local_jcc(Assembler::equal, *L_success); |
4500 | } else { |
4501 | local_jcc(Assembler::notEqual, *L_failure); |
4502 | final_jmp(*L_success); |
4503 | } |
4504 | } |
4505 | |
4506 | bind(L_fallthrough); |
4507 | |
4508 | #undef local_jcc |
4509 | #undef final_jmp |
4510 | } |
4511 | |
4512 | |
4513 | void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, |
4514 | Register super_klass, |
4515 | Register temp_reg, |
4516 | Register temp2_reg, |
4517 | Label* L_success, |
4518 | Label* L_failure, |
4519 | bool set_cond_codes) { |
4520 | assert_different_registers(sub_klass, super_klass, temp_reg); |
4521 | if (temp2_reg != noreg) |
4522 | assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg); |
4523 | #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) |
4524 | |
4525 | Label L_fallthrough; |
4526 | int label_nulls = 0; |
4527 | if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } |
4528 | if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } |
4529 | assert(label_nulls <= 1, "at most one NULL in the batch" ); |
4530 | |
4531 | // a couple of useful fields in sub_klass: |
4532 | int ss_offset = in_bytes(Klass::secondary_supers_offset()); |
4533 | int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); |
4534 | Address secondary_supers_addr(sub_klass, ss_offset); |
4535 | Address super_cache_addr( sub_klass, sc_offset); |
4536 | |
4537 | // Do a linear scan of the secondary super-klass chain. |
4538 | // This code is rarely used, so simplicity is a virtue here. |
4539 | // The repne_scan instruction uses fixed registers, which we must spill. |
4540 | // Don't worry too much about pre-existing connections with the input regs. |
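// Conceptually (a sketch):
//   Array<Klass*>* ss = sub_klass->secondary_supers();
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       *(Klass**)((address)sub_klass + sc_offset) = super_klass;  // update the cache
//       goto L_success;
//     }
//   }
//   goto L_failure;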
4541 | |
4542 | assert(sub_klass != rax, "killed reg" ); // killed by mov(rax, super) |
4543 | assert(sub_klass != rcx, "killed reg" ); // killed by lea(rcx, &pst_counter) |
4544 | |
4545 | // Get super_klass value into rax (even if it was in rdi or rcx). |
4546 | bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; |
4547 | if (super_klass != rax || UseCompressedOops) { |
4548 | if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; } |
4549 | mov(rax, super_klass); |
4550 | } |
4551 | if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } |
4552 | if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; } |
4553 | |
4554 | #ifndef PRODUCT |
4555 | int* pst_counter = &SharedRuntime::_partial_subtype_ctr; |
4556 | ExternalAddress pst_counter_addr((address) pst_counter); |
4557 | NOT_LP64( incrementl(pst_counter_addr) ); |
4558 | LP64_ONLY( lea(rcx, pst_counter_addr) ); |
4559 | LP64_ONLY( incrementl(Address(rcx, 0)) ); |
4560 | #endif //PRODUCT |
4561 | |
4562 | // We will consult the secondary-super array. |
4563 | movptr(rdi, secondary_supers_addr); |
4564 | // Load the array length. (Positive movl does right thing on LP64.) |
4565 | movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); |
4566 | // Skip to start of data. |
4567 | addptr(rdi, Array<Klass*>::base_offset_in_bytes()); |
4568 | |
4569 | // Scan RCX words at [RDI] for an occurrence of RAX. |
4570 | // Set NZ/Z based on last compare. |
// If RCX == 0, 'repne' executes no scas at all and leaves the flags untouched
// (only the repeated scas instruction sets them), so set Z = 0 (not equal)
// before the scan to indicate that the class was not found.
4574 | |
4575 | testptr(rax,rax); // Set Z = 0 |
4576 | repne_scan(); |
4577 | |
4578 | // Unspill the temp. registers: |
4579 | if (pushed_rdi) pop(rdi); |
4580 | if (pushed_rcx) pop(rcx); |
4581 | if (pushed_rax) pop(rax); |
4582 | |
4583 | if (set_cond_codes) { |
4584 | // Special hack for the AD files: rdi is guaranteed non-zero. |
4585 | assert(!pushed_rdi, "rdi must be left non-NULL" ); |
4586 | // Also, the condition codes are properly set Z/NZ on succeed/failure. |
4587 | } |
4588 | |
4589 | if (L_failure == &L_fallthrough) |
4590 | jccb(Assembler::notEqual, *L_failure); |
4591 | else jcc(Assembler::notEqual, *L_failure); |
4592 | |
4593 | // Success. Cache the super we found and proceed in triumph. |
4594 | movptr(super_cache_addr, super_klass); |
4595 | |
4596 | if (L_success != &L_fallthrough) { |
4597 | jmp(*L_success); |
4598 | } |
4599 | |
4600 | #undef IS_A_TEMP |
4601 | |
4602 | bind(L_fallthrough); |
4603 | } |
4604 | |
4605 | void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { |
4606 | assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required" ); |
4607 | |
4608 | Label L_fallthrough; |
4609 | if (L_fast_path == NULL) { |
4610 | L_fast_path = &L_fallthrough; |
4611 | } else if (L_slow_path == NULL) { |
4612 | L_slow_path = &L_fallthrough; |
4613 | } |
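
// Conceptually (a sketch):
//   if (klass->init_state() == fully_initialized)  goto L_fast_path;
//   if (klass->init_thread() == thread)            goto L_fast_path;  // reentrant initialization
//   goto L_slow_path;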
4614 | |
4615 | // Fast path check: class is fully initialized |
4616 | cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized); |
4617 | jcc(Assembler::equal, *L_fast_path); |
4618 | |
4619 | // Fast path check: current thread is initializer thread |
4620 | cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset())); |
4621 | if (L_slow_path == &L_fallthrough) { |
4622 | jcc(Assembler::equal, *L_fast_path); |
4623 | bind(*L_slow_path); |
4624 | } else if (L_fast_path == &L_fallthrough) { |
4625 | jcc(Assembler::notEqual, *L_slow_path); |
4626 | bind(*L_fast_path); |
4627 | } else { |
4628 | Unimplemented(); |
4629 | } |
4630 | } |
4631 | |
4632 | void MacroAssembler::cmov32(Condition cc, Register dst, Address src) { |
4633 | if (VM_Version::supports_cmov()) { |
4634 | cmovl(cc, dst, src); |
4635 | } else { |
4636 | Label L; |
4637 | jccb(negate_condition(cc), L); |
4638 | movl(dst, src); |
4639 | bind(L); |
4640 | } |
4641 | } |
4642 | |
4643 | void MacroAssembler::cmov32(Condition cc, Register dst, Register src) { |
4644 | if (VM_Version::supports_cmov()) { |
4645 | cmovl(cc, dst, src); |
4646 | } else { |
4647 | Label L; |
4648 | jccb(negate_condition(cc), L); |
4649 | movl(dst, src); |
4650 | bind(L); |
4651 | } |
4652 | } |
4653 | |
4654 | void MacroAssembler::verify_oop(Register reg, const char* s) { |
4655 | if (!VerifyOops) return; |
4656 | |
4657 | // Pass register number to verify_oop_subroutine |
4658 | const char* b = NULL; |
4659 | { |
4660 | ResourceMark rm; |
4661 | stringStream ss; |
4662 | ss.print("verify_oop: %s: %s" , reg->name(), s); |
4663 | b = code_string(ss.as_string()); |
4664 | } |
4665 | BLOCK_COMMENT("verify_oop {" ); |
4666 | #ifdef _LP64 |
4667 | push(rscratch1); // save r10, trashed by movptr() |
4668 | #endif |
4669 | push(rax); // save rax, |
4670 | push(reg); // pass register argument |
4671 | ExternalAddress buffer((address) b); |
4672 | // avoid using pushptr, as it modifies scratch registers |
4673 | // and our contract is not to modify anything |
4674 | movptr(rax, buffer.addr()); |
4675 | push(rax); |
4676 | // call indirectly to solve generation ordering problem |
4677 | movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); |
4678 | call(rax); |
4679 | // Caller pops the arguments (oop, message) and restores rax, r10 |
4680 | BLOCK_COMMENT("} verify_oop" ); |
4681 | } |
4682 | |
4683 | |
4684 | RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, |
4685 | Register tmp, |
4686 | int offset) { |
4687 | intptr_t value = *delayed_value_addr; |
4688 | if (value != 0) |
4689 | return RegisterOrConstant(value + offset); |
4690 | |
4691 | // load indirectly to solve generation ordering problem |
4692 | movptr(tmp, ExternalAddress((address) delayed_value_addr)); |
4693 | |
4694 | #ifdef ASSERT |
4695 | { Label L; |
4696 | testptr(tmp, tmp); |
4697 | if (WizardMode) { |
4698 | const char* buf = NULL; |
4699 | { |
4700 | ResourceMark rm; |
4701 | stringStream ss; |
4702 | ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]); |
4703 | buf = code_string(ss.as_string()); |
4704 | } |
4705 | jcc(Assembler::notZero, L); |
4706 | STOP(buf); |
4707 | } else { |
4708 | jccb(Assembler::notZero, L); |
4709 | hlt(); |
4710 | } |
4711 | bind(L); |
4712 | } |
4713 | #endif |
4714 | |
4715 | if (offset != 0) |
4716 | addptr(tmp, offset); |
4717 | |
4718 | return RegisterOrConstant(tmp); |
4719 | } |
4720 | |
4721 | |
4722 | Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, |
                                          int extra_slot_offset) {
4724 | // cf. TemplateTable::prepare_invoke(), if (load_receiver). |
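// The computed address is approximately (a sketch):
//   rsp + wordSize /* return PC */
//       + (arg_slot + extra_slot_offset) * Interpreter::stackElementSize
// where a register-valued arg_slot supplies the scaled-index part of the address.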
4725 | int stackElementSize = Interpreter::stackElementSize; |
4726 | int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); |
4727 | #ifdef ASSERT |
4728 | int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); |
4729 | assert(offset1 - offset == stackElementSize, "correct arithmetic" ); |
4730 | #endif |
4731 | Register scale_reg = noreg; |
4732 | Address::ScaleFactor scale_factor = Address::no_scale; |
4733 | if (arg_slot.is_constant()) { |
4734 | offset += arg_slot.as_constant() * stackElementSize; |
4735 | } else { |
4736 | scale_reg = arg_slot.as_register(); |
4737 | scale_factor = Address::times(stackElementSize); |
4738 | } |
4739 | offset += wordSize; // return PC is on stack |
4740 | return Address(rsp, scale_reg, scale_factor, offset); |
4741 | } |
4742 | |
4743 | |
4744 | void MacroAssembler::verify_oop_addr(Address addr, const char* s) { |
4745 | if (!VerifyOops) return; |
4746 | |
4747 | // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord); |
4748 | // Pass register number to verify_oop_subroutine |
4749 | const char* b = NULL; |
4750 | { |
4751 | ResourceMark rm; |
4752 | stringStream ss; |
4753 | ss.print("verify_oop_addr: %s" , s); |
4754 | b = code_string(ss.as_string()); |
4755 | } |
4756 | #ifdef _LP64 |
4757 | push(rscratch1); // save r10, trashed by movptr() |
4758 | #endif |
4759 | push(rax); // save rax, |
4760 | // addr may contain rsp so we will have to adjust it based on the push |
4761 | // we just did (and on 64 bit we do two pushes) |
// NOTE: the 64-bit code once had a bug here: it did movq(addr, rax), which
// stores rax into addr, the reverse of what was intended.
4764 | if (addr.uses(rsp)) { |
4765 | lea(rax, addr); |
4766 | pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord)); |
4767 | } else { |
4768 | pushptr(addr); |
4769 | } |
4770 | |
4771 | ExternalAddress buffer((address) b); |
4772 | // pass msg argument |
4773 | // avoid using pushptr, as it modifies scratch registers |
4774 | // and our contract is not to modify anything |
4775 | movptr(rax, buffer.addr()); |
4776 | push(rax); |
4777 | |
4778 | // call indirectly to solve generation ordering problem |
4779 | movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); |
4780 | call(rax); |
4781 | // Caller pops the arguments (addr, message) and restores rax, r10. |
4782 | } |
4783 | |
4784 | void MacroAssembler::verify_tlab() { |
4785 | #ifdef ASSERT |
4786 | if (UseTLAB && VerifyOops) { |
4787 | Label next, ok; |
4788 | Register t1 = rsi; |
4789 | Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread); |
4790 | |
4791 | push(t1); |
4792 | NOT_LP64(push(thread_reg)); |
4793 | NOT_LP64(get_thread(thread_reg)); |
4794 | |
4795 | movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); |
4796 | cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); |
4797 | jcc(Assembler::aboveEqual, next); |
4798 | STOP("assert(top >= start)" ); |
4799 | should_not_reach_here(); |
4800 | |
4801 | bind(next); |
4802 | movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); |
4803 | cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); |
4804 | jcc(Assembler::aboveEqual, ok); |
4805 | STOP("assert(top <= end)" ); |
4806 | should_not_reach_here(); |
4807 | |
4808 | bind(ok); |
4809 | NOT_LP64(pop(thread_reg)); |
4810 | pop(t1); |
4811 | } |
4812 | #endif |
4813 | } |
4814 | |
4815 | class ControlWord { |
4816 | public: |
4817 | int32_t _value; |
4818 | |
4819 | int rounding_control() const { return (_value >> 10) & 3 ; } |
4820 | int precision_control() const { return (_value >> 8) & 3 ; } |
4821 | bool precision() const { return ((_value >> 5) & 1) != 0; } |
4822 | bool underflow() const { return ((_value >> 4) & 1) != 0; } |
4823 | bool overflow() const { return ((_value >> 3) & 1) != 0; } |
4824 | bool zero_divide() const { return ((_value >> 2) & 1) != 0; } |
4825 | bool denormalized() const { return ((_value >> 1) & 1) != 0; } |
4826 | bool invalid() const { return ((_value >> 0) & 1) != 0; } |
4827 | |
4828 | void print() const { |
4829 | // rounding control |
4830 | const char* rc; |
4831 | switch (rounding_control()) { |
4832 | case 0: rc = "round near" ; break; |
4833 | case 1: rc = "round down" ; break; |
4834 | case 2: rc = "round up " ; break; |
4835 | case 3: rc = "chop " ; break; |
4836 | }; |
4837 | // precision control |
4838 | const char* pc; |
4839 | switch (precision_control()) { |
4840 | case 0: pc = "24 bits " ; break; |
4841 | case 1: pc = "reserved" ; break; |
4842 | case 2: pc = "53 bits " ; break; |
4843 | case 3: pc = "64 bits " ; break; |
4844 | }; |
4845 | // flags |
4846 | char f[9]; |
4847 | f[0] = ' '; |
4848 | f[1] = ' '; |
4849 | f[2] = (precision ()) ? 'P' : 'p'; |
4850 | f[3] = (underflow ()) ? 'U' : 'u'; |
4851 | f[4] = (overflow ()) ? 'O' : 'o'; |
4852 | f[5] = (zero_divide ()) ? 'Z' : 'z'; |
4853 | f[6] = (denormalized()) ? 'D' : 'd'; |
4854 | f[7] = (invalid ()) ? 'I' : 'i'; |
4855 | f[8] = '\x0'; |
4856 | // output |
4857 | printf("%04x masks = %s, %s, %s" , _value & 0xFFFF, f, rc, pc); |
4858 | } |
4859 | |
4860 | }; |
4861 | |
4862 | class StatusWord { |
4863 | public: |
4864 | int32_t _value; |
4865 | |
4866 | bool busy() const { return ((_value >> 15) & 1) != 0; } |
4867 | bool C3() const { return ((_value >> 14) & 1) != 0; } |
4868 | bool C2() const { return ((_value >> 10) & 1) != 0; } |
4869 | bool C1() const { return ((_value >> 9) & 1) != 0; } |
4870 | bool C0() const { return ((_value >> 8) & 1) != 0; } |
4871 | int top() const { return (_value >> 11) & 7 ; } |
4872 | bool error_status() const { return ((_value >> 7) & 1) != 0; } |
4873 | bool stack_fault() const { return ((_value >> 6) & 1) != 0; } |
4874 | bool precision() const { return ((_value >> 5) & 1) != 0; } |
4875 | bool underflow() const { return ((_value >> 4) & 1) != 0; } |
4876 | bool overflow() const { return ((_value >> 3) & 1) != 0; } |
4877 | bool zero_divide() const { return ((_value >> 2) & 1) != 0; } |
4878 | bool denormalized() const { return ((_value >> 1) & 1) != 0; } |
4879 | bool invalid() const { return ((_value >> 0) & 1) != 0; } |
4880 | |
4881 | void print() const { |
4882 | // condition codes |
4883 | char c[5]; |
4884 | c[0] = (C3()) ? '3' : '-'; |
4885 | c[1] = (C2()) ? '2' : '-'; |
4886 | c[2] = (C1()) ? '1' : '-'; |
4887 | c[3] = (C0()) ? '0' : '-'; |
4888 | c[4] = '\x0'; |
4889 | // flags |
4890 | char f[9]; |
4891 | f[0] = (error_status()) ? 'E' : '-'; |
4892 | f[1] = (stack_fault ()) ? 'S' : '-'; |
4893 | f[2] = (precision ()) ? 'P' : '-'; |
4894 | f[3] = (underflow ()) ? 'U' : '-'; |
4895 | f[4] = (overflow ()) ? 'O' : '-'; |
4896 | f[5] = (zero_divide ()) ? 'Z' : '-'; |
4897 | f[6] = (denormalized()) ? 'D' : '-'; |
4898 | f[7] = (invalid ()) ? 'I' : '-'; |
4899 | f[8] = '\x0'; |
4900 | // output |
4901 | printf("%04x flags = %s, cc = %s, top = %d" , _value & 0xFFFF, f, c, top()); |
4902 | } |
4903 | |
4904 | }; |
4905 | |
4906 | class TagWord { |
4907 | public: |
4908 | int32_t _value; |
4909 | |
4910 | int tag_at(int i) const { return (_value >> (i*2)) & 3; } |
4911 | |
4912 | void print() const { |
4913 | printf("%04x" , _value & 0xFFFF); |
4914 | } |
4915 | |
4916 | }; |
4917 | |
4918 | class FPU_Register { |
4919 | public: |
4920 | int32_t _m0; |
4921 | int32_t _m1; |
4922 | int16_t _ex; |
4923 | |
4924 | bool is_indefinite() const { |
4925 | return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0; |
4926 | } |
4927 | |
4928 | void print() const { |
4929 | char sign = (_ex < 0) ? '-' : '+'; |
4930 | const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " " ; |
4931 | printf("%c%04hx.%08x%08x %s" , sign, _ex, _m1, _m0, kind); |
4932 | }; |
4933 | |
4934 | }; |
4935 | |
4936 | class FPU_State { |
4937 | public: |
4938 | enum { |
4939 | register_size = 10, |
4940 | number_of_registers = 8, |
4941 | register_mask = 7 |
4942 | }; |
4943 | |
4944 | ControlWord _control_word; |
4945 | StatusWord _status_word; |
4946 | TagWord _tag_word; |
4947 | int32_t _error_offset; |
4948 | int32_t _error_selector; |
4949 | int32_t _data_offset; |
4950 | int32_t _data_selector; |
4951 | int8_t _register[register_size * number_of_registers]; |
4952 | |
4953 | int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); } |
4954 | FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; } |
4955 | |
4956 | const char* tag_as_string(int tag) const { |
4957 | switch (tag) { |
4958 | case 0: return "valid" ; |
4959 | case 1: return "zero" ; |
4960 | case 2: return "special" ; |
4961 | case 3: return "empty" ; |
4962 | } |
4963 | ShouldNotReachHere(); |
4964 | return NULL; |
4965 | } |
4966 | |
4967 | void print() const { |
4968 | // print computation registers |
4969 | { int t = _status_word.top(); |
4970 | for (int i = 0; i < number_of_registers; i++) { |
4971 | int j = (i - t) & register_mask; |
4972 | printf("%c r%d = ST%d = " , (j == 0 ? '*' : ' '), i, j); |
4973 | st(j)->print(); |
4974 | printf(" %s\n" , tag_as_string(_tag_word.tag_at(i))); |
4975 | } |
4976 | } |
4977 | printf("\n" ); |
4978 | // print control registers |
4979 | printf("ctrl = " ); _control_word.print(); printf("\n" ); |
4980 | printf("stat = " ); _status_word .print(); printf("\n" ); |
4981 | printf("tags = " ); _tag_word .print(); printf("\n" ); |
4982 | } |
4983 | |
4984 | }; |
4985 | |
4986 | class Flag_Register { |
4987 | public: |
4988 | int32_t _value; |
4989 | |
4990 | bool overflow() const { return ((_value >> 11) & 1) != 0; } |
4991 | bool direction() const { return ((_value >> 10) & 1) != 0; } |
4992 | bool sign() const { return ((_value >> 7) & 1) != 0; } |
4993 | bool zero() const { return ((_value >> 6) & 1) != 0; } |
4994 | bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; } |
4995 | bool parity() const { return ((_value >> 2) & 1) != 0; } |
4996 | bool carry() const { return ((_value >> 0) & 1) != 0; } |
4997 | |
4998 | void print() const { |
4999 | // flags |
5000 | char f[8]; |
5001 | f[0] = (overflow ()) ? 'O' : '-'; |
5002 | f[1] = (direction ()) ? 'D' : '-'; |
5003 | f[2] = (sign ()) ? 'S' : '-'; |
5004 | f[3] = (zero ()) ? 'Z' : '-'; |
5005 | f[4] = (auxiliary_carry()) ? 'A' : '-'; |
5006 | f[5] = (parity ()) ? 'P' : '-'; |
5007 | f[6] = (carry ()) ? 'C' : '-'; |
5008 | f[7] = '\x0'; |
5009 | // output |
5010 | printf("%08x flags = %s" , _value, f); |
5011 | } |
5012 | |
5013 | }; |
5014 | |
5015 | class IU_Register { |
5016 | public: |
5017 | int32_t _value; |
5018 | |
5019 | void print() const { |
5020 | printf("%08x %11d" , _value, _value); |
5021 | } |
5022 | |
5023 | }; |
5024 | |
5025 | class IU_State { |
5026 | public: |
5027 | Flag_Register _eflags; |
5028 | IU_Register _rdi; |
5029 | IU_Register _rsi; |
5030 | IU_Register _rbp; |
5031 | IU_Register _rsp; |
5032 | IU_Register _rbx; |
5033 | IU_Register _rdx; |
5034 | IU_Register _rcx; |
5035 | IU_Register _rax; |
5036 | |
5037 | void print() const { |
5038 | // computation registers |
5039 | printf("rax, = " ); _rax.print(); printf("\n" ); |
5040 | printf("rbx, = " ); _rbx.print(); printf("\n" ); |
5041 | printf("rcx = " ); _rcx.print(); printf("\n" ); |
5042 | printf("rdx = " ); _rdx.print(); printf("\n" ); |
5043 | printf("rdi = " ); _rdi.print(); printf("\n" ); |
5044 | printf("rsi = " ); _rsi.print(); printf("\n" ); |
5045 | printf("rbp, = " ); _rbp.print(); printf("\n" ); |
5046 | printf("rsp = " ); _rsp.print(); printf("\n" ); |
5047 | printf("\n" ); |
5048 | // control registers |
5049 | printf("flgs = " ); _eflags.print(); printf("\n" ); |
5050 | } |
5051 | }; |
5052 | |
5053 | |
5054 | class CPU_State { |
5055 | public: |
5056 | FPU_State _fpu_state; |
5057 | IU_State _iu_state; |
5058 | |
5059 | void print() const { |
5060 | printf("--------------------------------------------------\n" ); |
5061 | _iu_state .print(); |
5062 | printf("\n" ); |
5063 | _fpu_state.print(); |
5064 | printf("--------------------------------------------------\n" ); |
5065 | } |
5066 | |
5067 | }; |
5068 | |
5069 | |
5070 | static void _print_CPU_state(CPU_State* state) { |
5071 | state->print(); |
5072 | }; |
5073 | |
5074 | |
5075 | void MacroAssembler::print_CPU_state() { |
5076 | push_CPU_state(); |
5077 | push(rsp); // pass CPU state |
5078 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state))); |
5079 | addptr(rsp, wordSize); // discard argument |
5080 | pop_CPU_state(); |
5081 | } |
5082 | |
5083 | |
5084 | static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) { |
5085 | static int counter = 0; |
5086 | FPU_State* fs = &state->_fpu_state; |
5087 | counter++; |
5088 | // For leaf calls, only verify that the top few elements remain empty. |
5089 | // We only need 1 empty at the top for C2 code. |
5090 | if( stack_depth < 0 ) { |
5091 | if( fs->tag_for_st(7) != 3 ) { |
5092 | printf("FPR7 not empty\n" ); |
5093 | state->print(); |
5094 | assert(false, "error" ); |
5095 | return false; |
5096 | } |
5097 | return true; // All other stack states do not matter |
5098 | } |
5099 | |
5100 | assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std, |
5101 | "bad FPU control word" ); |
5102 | |
5103 | // compute stack depth |
5104 | int i = 0; |
5105 | while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++; |
5106 | int d = i; |
5107 | while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++; |
5108 | // verify findings |
5109 | if (i != FPU_State::number_of_registers) { |
5110 | // stack not contiguous |
5111 | printf("%s: stack not contiguous at ST%d\n" , s, i); |
5112 | state->print(); |
5113 | assert(false, "error" ); |
5114 | return false; |
5115 | } |
5116 | // check if computed stack depth corresponds to expected stack depth |
5117 | if (stack_depth < 0) { |
5118 | // expected stack depth is -stack_depth or less |
5119 | if (d > -stack_depth) { |
5120 | // too many elements on the stack |
5121 | printf("%s: <= %d stack elements expected but found %d\n" , s, -stack_depth, d); |
5122 | state->print(); |
5123 | assert(false, "error" ); |
5124 | return false; |
5125 | } |
5126 | } else { |
5127 | // expected stack depth is stack_depth |
5128 | if (d != stack_depth) { |
5129 | // wrong stack depth |
5130 | printf("%s: %d stack elements expected but found %d\n" , s, stack_depth, d); |
5131 | state->print(); |
5132 | assert(false, "error" ); |
5133 | return false; |
5134 | } |
5135 | } |
5136 | // everything is cool |
5137 | return true; |
5138 | } |
5139 | |
5140 | |
5141 | void MacroAssembler::verify_FPU(int stack_depth, const char* s) { |
5142 | if (!VerifyFPU) return; |
5143 | push_CPU_state(); |
5144 | push(rsp); // pass CPU state |
5145 | ExternalAddress msg((address) s); |
5146 | // pass message string s |
5147 | pushptr(msg.addr()); |
5148 | push(stack_depth); // pass stack depth |
5149 | call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU))); |
5150 | addptr(rsp, 3 * wordSize); // discard arguments |
5151 | // check for error |
5152 | { Label L; |
5153 | testl(rax, rax); |
5154 | jcc(Assembler::notZero, L); |
5155 | int3(); // break if error condition |
5156 | bind(L); |
5157 | } |
5158 | pop_CPU_state(); |
5159 | } |
5160 | |
5161 | void MacroAssembler::restore_cpu_control_state_after_jni() { |
5162 | // Either restore the MXCSR register after returning from the JNI Call |
5163 | // or verify that it wasn't changed (with -Xcheck:jni flag). |
5164 | if (VM_Version::supports_sse()) { |
5165 | if (RestoreMXCSROnJNICalls) { |
5166 | ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); |
5167 | } else if (CheckJNICalls) { |
5168 | call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); |
5169 | } |
5170 | } |
5171 | // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. |
5172 | vzeroupper(); |
5173 | // Reset k1 to 0xffff. |
5174 | |
5175 | #ifdef COMPILER2 |
5176 | if (PostLoopMultiversioning && VM_Version::supports_evex()) { |
5177 | push(rcx); |
5178 | movl(rcx, 0xffff); |
5179 | kmovwl(k1, rcx); |
5180 | pop(rcx); |
5181 | } |
5182 | #endif // COMPILER2 |
5183 | |
5184 | #ifndef _LP64 |
// Either restore the x87 floating-point control word after returning
// from the JNI call or verify that it wasn't changed.
5187 | if (CheckJNICalls) { |
5188 | call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); |
5189 | } |
5190 | #endif // _LP64 |
5191 | } |
5192 | |
5193 | // ((OopHandle)result).resolve(); |
5194 | void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { |
5195 | assert_different_registers(result, tmp); |
5196 | |
5197 | // Only 64 bit platforms support GCs that require a tmp register |
5198 | // Only IN_HEAP loads require a thread_tmp register |
5199 | // OopHandle::resolve is an indirection like jobject. |
5200 | access_load_at(T_OBJECT, IN_NATIVE, |
5201 | result, Address(result, 0), tmp, /*tmp_thread*/noreg); |
5202 | } |
5203 | |
5204 | // ((WeakHandle)result).resolve(); |
5205 | void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) { |
5206 | assert_different_registers(rresult, rtmp); |
5207 | Label resolved; |
5208 | |
5209 | // A null weak handle resolves to null. |
5210 | cmpptr(rresult, 0); |
5211 | jcc(Assembler::equal, resolved); |
5212 | |
5213 | // Only 64 bit platforms support GCs that require a tmp register |
5214 | // Only IN_HEAP loads require a thread_tmp register |
5215 | // WeakHandle::resolve is an indirection like jweak. |
5216 | access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, |
5217 | rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg); |
5218 | bind(resolved); |
5219 | } |
5220 | |
5221 | void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) { |
5222 | // get mirror |
5223 | const int mirror_offset = in_bytes(Klass::java_mirror_offset()); |
5224 | load_method_holder(mirror, method); |
5225 | movptr(mirror, Address(mirror, mirror_offset)); |
5226 | resolve_oop_handle(mirror, tmp); |
5227 | } |
5228 | |
5229 | void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) { |
5230 | load_method_holder(rresult, rmethod); |
5231 | movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset())); |
5232 | } |
5233 | |
5234 | void MacroAssembler::load_method_holder(Register holder, Register method) { |
5235 | movptr(holder, Address(method, Method::const_offset())); // ConstMethod* |
5236 | movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* |
5237 | movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass* |
5238 | } |
5239 | |
5240 | void MacroAssembler::load_klass(Register dst, Register src) { |
5241 | #ifdef _LP64 |
5242 | if (UseCompressedClassPointers) { |
5243 | movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); |
5244 | decode_klass_not_null(dst); |
5245 | } else |
5246 | #endif |
5247 | movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); |
5248 | } |
5249 | |
void MacroAssembler::load_prototype_header(Register dst, Register src) {
5251 | load_klass(dst, src); |
5252 | movptr(dst, Address(dst, Klass::prototype_header_offset())); |
5253 | } |
5254 | |
5255 | void MacroAssembler::store_klass(Register dst, Register src) { |
5256 | #ifdef _LP64 |
5257 | if (UseCompressedClassPointers) { |
5258 | encode_klass_not_null(src); |
5259 | movl(Address(dst, oopDesc::klass_offset_in_bytes()), src); |
5260 | } else |
5261 | #endif |
5262 | movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src); |
5263 | } |
5264 | |
5265 | void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src, |
5266 | Register tmp1, Register thread_tmp) { |
5267 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
5268 | decorators = AccessInternal::decorator_fixup(decorators); |
5269 | bool as_raw = (decorators & AS_RAW) != 0; |
5270 | if (as_raw) { |
5271 | bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); |
5272 | } else { |
5273 | bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); |
5274 | } |
5275 | } |
5276 | |
5277 | void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src, |
5278 | Register tmp1, Register tmp2) { |
5279 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
5280 | decorators = AccessInternal::decorator_fixup(decorators); |
5281 | bool as_raw = (decorators & AS_RAW) != 0; |
5282 | if (as_raw) { |
5283 | bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2); |
5284 | } else { |
5285 | bs->store_at(this, decorators, type, dst, src, tmp1, tmp2); |
5286 | } |
5287 | } |
5288 | |
5289 | void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { |
5290 | // Use stronger ACCESS_WRITE|ACCESS_READ by default. |
5291 | if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { |
5292 | decorators |= ACCESS_READ | ACCESS_WRITE; |
5293 | } |
5294 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
5295 | return bs->resolve(this, decorators, obj); |
5296 | } |
5297 | |
5298 | void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, |
5299 | Register thread_tmp, DecoratorSet decorators) { |
5300 | access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); |
5301 | } |
5302 | |
// Doesn't do verification, generates fixed-size code
5304 | void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, |
5305 | Register thread_tmp, DecoratorSet decorators) { |
5306 | access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); |
5307 | } |
5308 | |
5309 | void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, |
5310 | Register tmp2, DecoratorSet decorators) { |
5311 | access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2); |
5312 | } |
5313 | |
5314 | // Used for storing NULLs. |
5315 | void MacroAssembler::store_heap_oop_null(Address dst) { |
5316 | access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); |
5317 | } |
5318 | |
5319 | #ifdef _LP64 |
5320 | void MacroAssembler::store_klass_gap(Register dst, Register src) { |
5321 | if (UseCompressedClassPointers) { |
5322 | // Store to klass gap in destination |
5323 | movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src); |
5324 | } |
5325 | } |
5326 | |
5327 | #ifdef ASSERT |
5328 | void MacroAssembler::verify_heapbase(const char* msg) { |
5329 | assert (UseCompressedOops, "should be compressed" ); |
5330 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5331 | if (CheckCompressedOops) { |
5332 | Label ok; |
5333 | push(rscratch1); // cmpptr trashes rscratch1 |
5334 | cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); |
5335 | jcc(Assembler::equal, ok); |
5336 | STOP(msg); |
5337 | bind(ok); |
5338 | pop(rscratch1); |
5339 | } |
5340 | } |
5341 | #endif |
5342 | |
5343 | // Algorithm must match oop.inline.hpp encode_heap_oop. |
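// A sketch of the encoding:
//   narrow = (oop == NULL) ? 0 : (oop - CompressedOops::base()) >> CompressedOops::shift()
// The cmovq below substitutes the heap base for a NULL oop so that the
// subtraction yields zero without needing a branch.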
5344 | void MacroAssembler::encode_heap_oop(Register r) { |
5345 | #ifdef ASSERT |
5346 | verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?" ); |
5347 | #endif |
5348 | verify_oop(r, "broken oop in encode_heap_oop" ); |
5349 | if (CompressedOops::base() == NULL) { |
5350 | if (CompressedOops::shift() != 0) { |
5351 | assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5352 | shrq(r, LogMinObjAlignmentInBytes); |
5353 | } |
5354 | return; |
5355 | } |
5356 | testq(r, r); |
5357 | cmovq(Assembler::equal, r, r12_heapbase); |
5358 | subq(r, r12_heapbase); |
5359 | shrq(r, LogMinObjAlignmentInBytes); |
5360 | } |
5361 | |
5362 | void MacroAssembler::encode_heap_oop_not_null(Register r) { |
5363 | #ifdef ASSERT |
5364 | verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?" ); |
5365 | if (CheckCompressedOops) { |
5366 | Label ok; |
5367 | testq(r, r); |
5368 | jcc(Assembler::notEqual, ok); |
5369 | STOP("null oop passed to encode_heap_oop_not_null" ); |
5370 | bind(ok); |
5371 | } |
5372 | #endif |
5373 | verify_oop(r, "broken oop in encode_heap_oop_not_null" ); |
5374 | if (CompressedOops::base() != NULL) { |
5375 | subq(r, r12_heapbase); |
5376 | } |
5377 | if (CompressedOops::shift() != 0) { |
5378 | assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5379 | shrq(r, LogMinObjAlignmentInBytes); |
5380 | } |
5381 | } |
5382 | |
5383 | void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { |
5384 | #ifdef ASSERT |
5385 | verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?" ); |
5386 | if (CheckCompressedOops) { |
5387 | Label ok; |
5388 | testq(src, src); |
5389 | jcc(Assembler::notEqual, ok); |
5390 | STOP("null oop passed to encode_heap_oop_not_null2" ); |
5391 | bind(ok); |
5392 | } |
5393 | #endif |
5394 | verify_oop(src, "broken oop in encode_heap_oop_not_null2" ); |
5395 | if (dst != src) { |
5396 | movq(dst, src); |
5397 | } |
5398 | if (CompressedOops::base() != NULL) { |
5399 | subq(dst, r12_heapbase); |
5400 | } |
5401 | if (CompressedOops::shift() != 0) { |
5402 | assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5403 | shrq(dst, LogMinObjAlignmentInBytes); |
5404 | } |
5405 | } |
5406 | |
5407 | void MacroAssembler::decode_heap_oop(Register r) { |
5408 | #ifdef ASSERT |
5409 | verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?" ); |
5410 | #endif |
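// Inverse of encode_heap_oop (a sketch):
//   oop = (narrow == 0) ? NULL : CompressedOops::base() + ((uint64_t)narrow << CompressedOops::shift())
// The jccb below skips the base add when the shifted value is zero, so NULL stays NULL.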
5411 | if (CompressedOops::base() == NULL) { |
5412 | if (CompressedOops::shift() != 0) { |
5413 | assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5414 | shlq(r, LogMinObjAlignmentInBytes); |
5415 | } |
5416 | } else { |
5417 | Label done; |
5418 | shlq(r, LogMinObjAlignmentInBytes); |
5419 | jccb(Assembler::equal, done); |
5420 | addq(r, r12_heapbase); |
5421 | bind(done); |
5422 | } |
5423 | verify_oop(r, "broken oop in decode_heap_oop" ); |
5424 | } |
5425 | |
5426 | void MacroAssembler::decode_heap_oop_not_null(Register r) { |
5427 | // Note: it will change flags |
5428 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5429 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5430 | // Cannot assert, unverified entry point counts instructions (see .ad file) |
5431 | // vtableStubs also counts instructions in pd_code_size_limit. |
5432 | // Also do not verify_oop as this is called by verify_oop. |
5433 | if (CompressedOops::shift() != 0) { |
5434 | assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5435 | shlq(r, LogMinObjAlignmentInBytes); |
5436 | if (CompressedOops::base() != NULL) { |
5437 | addq(r, r12_heapbase); |
5438 | } |
5439 | } else { |
5440 | assert (CompressedOops::base() == NULL, "sanity" ); |
5441 | } |
5442 | } |
5443 | |
5444 | void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { |
5445 | // Note: it will change flags |
5446 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5447 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5448 | // Cannot assert, unverified entry point counts instructions (see .ad file) |
5449 | // vtableStubs also counts instructions in pd_code_size_limit. |
5450 | // Also do not verify_oop as this is called by verify_oop. |
5451 | if (CompressedOops::shift() != 0) { |
5452 | assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong" ); |
5453 | if (LogMinObjAlignmentInBytes == Address::times_8) { |
5454 | leaq(dst, Address(r12_heapbase, src, Address::times_8, 0)); |
5455 | } else { |
5456 | if (dst != src) { |
5457 | movq(dst, src); |
5458 | } |
5459 | shlq(dst, LogMinObjAlignmentInBytes); |
5460 | if (CompressedOops::base() != NULL) { |
5461 | addq(dst, r12_heapbase); |
5462 | } |
5463 | } |
5464 | } else { |
5465 | assert (CompressedOops::base() == NULL, "sanity" ); |
5466 | if (dst != src) { |
5467 | movq(dst, src); |
5468 | } |
5469 | } |
5470 | } |
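
// Klass pointer compression (a sketch):
//   narrow_klass = (Klass* - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift()
// r12 is borrowed below to hold the klass base and is restored afterwards via reinit_heapbase().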
5471 | |
5472 | void MacroAssembler::encode_klass_not_null(Register r) { |
5473 | if (CompressedKlassPointers::base() != NULL) { |
5474 | // Use r12 as a scratch register in which to temporarily load the narrow_klass_base. |
5475 | assert(r != r12_heapbase, "Encoding a klass in r12" ); |
5476 | mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base()); |
5477 | subq(r, r12_heapbase); |
5478 | } |
5479 | if (CompressedKlassPointers::shift() != 0) { |
5480 | assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong" ); |
5481 | shrq(r, LogKlassAlignmentInBytes); |
5482 | } |
5483 | if (CompressedKlassPointers::base() != NULL) { |
5484 | reinit_heapbase(); |
5485 | } |
5486 | } |
5487 | |
5488 | void MacroAssembler::encode_klass_not_null(Register dst, Register src) { |
5489 | if (dst == src) { |
5490 | encode_klass_not_null(src); |
5491 | } else { |
5492 | if (CompressedKlassPointers::base() != NULL) { |
5493 | mov64(dst, (int64_t)CompressedKlassPointers::base()); |
5494 | negq(dst); |
5495 | addq(dst, src); |
5496 | } else { |
5497 | movptr(dst, src); |
5498 | } |
5499 | if (CompressedKlassPointers::shift() != 0) { |
5500 | assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong" ); |
5501 | shrq(dst, LogKlassAlignmentInBytes); |
5502 | } |
5503 | } |
5504 | } |
5505 | |
5506 | // Function instr_size_for_decode_klass_not_null() counts the instructions |
5507 | // generated by decode_klass_not_null(register r) and reinit_heapbase(), |
5508 | // when (Universe::heap() != NULL). Hence, if the instructions they |
5509 | // generate change, then this method needs to be updated. |
5510 | int MacroAssembler::instr_size_for_decode_klass_not_null() { |
5511 | assert (UseCompressedClassPointers, "only for compressed klass ptrs" ); |
5512 | if (CompressedKlassPointers::base() != NULL) { |
5513 | // mov64 + addq + shlq? + mov64 (for reinit_heapbase()). |
5514 | return (CompressedKlassPointers::shift() == 0 ? 20 : 24); |
5515 | } else { |
5516 | // longest load decode klass function, mov64, leaq |
5517 | return 16; |
5518 | } |
5519 | } |
5520 | |
5521 | // !!! If the instructions that get generated here change then function |
5522 | // instr_size_for_decode_klass_not_null() needs to get updated. |
5523 | void MacroAssembler::decode_klass_not_null(Register r) { |
5524 | // Note: it will change flags |
5525 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5526 | assert(r != r12_heapbase, "Decoding a klass in r12" ); |
5527 | // Cannot assert, unverified entry point counts instructions (see .ad file) |
5528 | // vtableStubs also counts instructions in pd_code_size_limit. |
5529 | // Also do not verify_oop as this is called by verify_oop. |
5530 | if (CompressedKlassPointers::shift() != 0) { |
5531 | assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong" ); |
5532 | shlq(r, LogKlassAlignmentInBytes); |
5533 | } |
5534 | // Use r12 as a scratch register in which to temporarily load the narrow_klass_base. |
5535 | if (CompressedKlassPointers::base() != NULL) { |
5536 | mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base()); |
5537 | addq(r, r12_heapbase); |
5538 | reinit_heapbase(); |
5539 | } |
5540 | } |
5541 | |
5542 | void MacroAssembler::decode_klass_not_null(Register dst, Register src) { |
5543 | // Note: it will change flags |
5544 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5545 | if (dst == src) { |
5546 | decode_klass_not_null(dst); |
5547 | } else { |
5548 | // Cannot assert, unverified entry point counts instructions (see .ad file) |
5549 | // vtableStubs also counts instructions in pd_code_size_limit. |
5550 | // Also do not verify_oop as this is called by verify_oop. |
5551 | mov64(dst, (int64_t)CompressedKlassPointers::base()); |
5552 | if (CompressedKlassPointers::shift() != 0) { |
5553 | assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong" ); |
5554 | assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?" ); |
5555 | leaq(dst, Address(dst, src, Address::times_8, 0)); |
5556 | } else { |
5557 | addq(dst, src); |
5558 | } |
5559 | } |
5560 | } |
5561 | |
5562 | void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { |
5563 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5564 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5565 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5566 | int oop_index = oop_recorder()->find_index(obj); |
5567 | RelocationHolder rspec = oop_Relocation::spec(oop_index); |
5568 | mov_narrow_oop(dst, oop_index, rspec); |
5569 | } |
5570 | |
5571 | void MacroAssembler::set_narrow_oop(Address dst, jobject obj) { |
5572 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5573 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5574 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5575 | int oop_index = oop_recorder()->find_index(obj); |
5576 | RelocationHolder rspec = oop_Relocation::spec(oop_index); |
5577 | mov_narrow_oop(dst, oop_index, rspec); |
5578 | } |
5579 | |
5580 | void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { |
5581 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5582 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5583 | int klass_index = oop_recorder()->find_index(k); |
5584 | RelocationHolder rspec = metadata_Relocation::spec(klass_index); |
5585 | mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); |
5586 | } |
5587 | |
5588 | void MacroAssembler::set_narrow_klass(Address dst, Klass* k) { |
5589 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5590 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5591 | int klass_index = oop_recorder()->find_index(k); |
5592 | RelocationHolder rspec = metadata_Relocation::spec(klass_index); |
5593 | mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); |
5594 | } |
5595 | |
5596 | void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) { |
5597 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5598 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5599 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5600 | int oop_index = oop_recorder()->find_index(obj); |
5601 | RelocationHolder rspec = oop_Relocation::spec(oop_index); |
5602 | Assembler::cmp_narrow_oop(dst, oop_index, rspec); |
5603 | } |
5604 | |
5605 | void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { |
5606 | assert (UseCompressedOops, "should only be used for compressed headers" ); |
5607 | assert (Universe::heap() != NULL, "java heap should be initialized" ); |
5608 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5609 | int oop_index = oop_recorder()->find_index(obj); |
5610 | RelocationHolder rspec = oop_Relocation::spec(oop_index); |
5611 | Assembler::cmp_narrow_oop(dst, oop_index, rspec); |
5612 | } |
5613 | |
5614 | void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { |
5615 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5616 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5617 | int klass_index = oop_recorder()->find_index(k); |
5618 | RelocationHolder rspec = metadata_Relocation::spec(klass_index); |
5619 | Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); |
5620 | } |
5621 | |
5622 | void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { |
5623 | assert (UseCompressedClassPointers, "should only be used for compressed headers" ); |
5624 | assert (oop_recorder() != NULL, "this assembler needs an OopRecorder" ); |
5625 | int klass_index = oop_recorder()->find_index(k); |
5626 | RelocationHolder rspec = metadata_Relocation::spec(klass_index); |
5627 | Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); |
5628 | } |
5629 | |
5630 | void MacroAssembler::reinit_heapbase() { |
5631 | if (UseCompressedOops || UseCompressedClassPointers) { |
5632 | if (Universe::heap() != NULL) { |
5633 | if (CompressedOops::base() == NULL) { |
5634 | MacroAssembler::xorptr(r12_heapbase, r12_heapbase); |
5635 | } else { |
5636 | mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base()); |
5637 | } |
5638 | } else { |
5639 | movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); |
5640 | } |
5641 | } |
5642 | } |
5643 | |
5644 | #endif // _LP64 |
5645 | |
5646 | // C2 compiled method's prolog code. |
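// A sketch of the emitted shape: optionally bang the stack pages, spill rbp,
// then drop rsp by the remaining frame size.  When no stack bang is emitted,
// the frame is allocated first with a forced 4-byte immediate (subptr_imm32)
// so that the first instruction is at least 5 bytes and stays patchable, and
// rbp is then stored into its slot at the top of the new frame.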
5647 | void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { |
5648 | |
5649 | // WARNING: Initial instruction MUST be 5 bytes or longer so that |
5650 | // NativeJump::patch_verified_entry will be able to patch out the entry |
5651 | // code safely. The push to verify stack depth is ok at 5 bytes, |
5652 | // the frame allocation can be either 3 or 6 bytes. So if we don't do |
5653 | // stack bang then we must use the 6 byte frame allocation even if |
5654 | // we have no frame. :-( |
5655 | assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect" ); |
5656 | |
5657 | assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned" ); |
5658 | // Remove word for return addr |
5659 | framesize -= wordSize; |
5660 | stack_bang_size -= wordSize; |
5661 | |
// Calls to C2R adapters often do not accept exceptional returns.
// We require that their callers bang the stack for them. But be careful,
// because some VM calls (such as call site linkage) can use several
// kilobytes of stack; the stack safety zone should account for that.
5666 | // See bugs 4446381, 4468289, 4497237. |
5667 | if (stack_bang_size > 0) { |
5668 | generate_stack_overflow_check(stack_bang_size); |
5669 | |
// We always push rbp so that on return to the interpreter rbp will be
// restored correctly and we can correct the stack.
5672 | push(rbp); |
5673 | // Save caller's stack pointer into RBP if the frame pointer is preserved. |
5674 | if (PreserveFramePointer) { |
5675 | mov(rbp, rsp); |
5676 | } |
5677 | // Remove word for ebp |
5678 | framesize -= wordSize; |
5679 | |
5680 | // Create frame |
5681 | if (framesize) { |
5682 | subptr(rsp, framesize); |
5683 | } |
5684 | } else { |
5685 | // Create frame (force generation of a 4 byte immediate value) |
5686 | subptr_imm32(rsp, framesize); |
5687 | |
5688 | // Save RBP register now. |
5689 | framesize -= wordSize; |
5690 | movptr(Address(rsp, framesize), rbp); |
5691 | // Save caller's stack pointer into RBP if the frame pointer is preserved. |
5692 | if (PreserveFramePointer) { |
5693 | movptr(rbp, rsp); |
5694 | if (framesize > 0) { |
5695 | addptr(rbp, framesize); |
5696 | } |
5697 | } |
5698 | } |
5699 | |
5700 | if (VerifyStackAtCalls) { // Majik cookie to verify stack depth |
5701 | framesize -= wordSize; |
5702 | movptr(Address(rsp, framesize), (int32_t)0xbadb100d); |
5703 | } |
5704 | |
5705 | #ifndef _LP64 |
5706 | // If method sets FPU control word do it now |
5707 | if (fp_mode_24b) { |
5708 | fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); |
5709 | } |
5710 | if (UseSSE >= 2 && VerifyFPU) { |
5711 | verify_FPU(0, "FPU stack must be clean on entry" ); |
5712 | } |
5713 | #endif |
5714 | |
5715 | #ifdef ASSERT |
5716 | if (VerifyStackAtCalls) { |
5717 | Label L; |
5718 | push(rax); |
5719 | mov(rax, rsp); |
5720 | andptr(rax, StackAlignmentInBytes-1); |
5721 | cmpptr(rax, StackAlignmentInBytes-wordSize); |
5722 | pop(rax); |
5723 | jcc(Assembler::equal, L); |
5724 | STOP("Stack is not properly aligned!" ); |
5725 | bind(L); |
5726 | } |
5727 | #endif |
5728 | |
5729 | if (!is_stub) { |
5730 | BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
5731 | bs->nmethod_entry_barrier(this); |
5732 | } |
5733 | } |
5734 | |
5735 | // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers |
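// In outline, the loop structure below behaves like this sketch (cnt counts
// qwords; this is a description of the strategy, not the emitted code):
//
//   while (cnt >= 8) { store 64 zero bytes at base; base += 64; cnt -= 8; }
//   if    (cnt >= 4) { store 32 zero bytes at base; base += 32; cnt -= 4; }
//   while (cnt >  0) { store  8 zero bytes at base; base +=  8; cnt -= 1; }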
5736 | void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) { |
5737 | // cnt - number of qwords (8-byte words). |
5738 | // base - start address, qword aligned. |
5739 | Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; |
5740 | if (UseAVX >= 2) { |
5741 | vpxor(xtmp, xtmp, xtmp, AVX_256bit); |
5742 | } else { |
5743 | pxor(xtmp, xtmp); |
5744 | } |
5745 | jmp(L_zero_64_bytes); |
5746 | |
5747 | BIND(L_loop); |
5748 | if (UseAVX >= 2) { |
5749 | vmovdqu(Address(base, 0), xtmp); |
5750 | vmovdqu(Address(base, 32), xtmp); |
5751 | } else { |
5752 | movdqu(Address(base, 0), xtmp); |
5753 | movdqu(Address(base, 16), xtmp); |
5754 | movdqu(Address(base, 32), xtmp); |
5755 | movdqu(Address(base, 48), xtmp); |
5756 | } |
5757 | addptr(base, 64); |
5758 | |
5759 | BIND(L_zero_64_bytes); |
5760 | subptr(cnt, 8); |
5761 | jccb(Assembler::greaterEqual, L_loop); |
5762 | addptr(cnt, 4); |
5763 | jccb(Assembler::less, L_tail); |
// Clear trailing 32 bytes
5765 | if (UseAVX >= 2) { |
5766 | vmovdqu(Address(base, 0), xtmp); |
5767 | } else { |
5768 | movdqu(Address(base, 0), xtmp); |
5769 | movdqu(Address(base, 16), xtmp); |
5770 | } |
5771 | addptr(base, 32); |
5772 | subptr(cnt, 4); |
5773 | |
5774 | BIND(L_tail); |
5775 | addptr(cnt, 4); |
5776 | jccb(Assembler::lessEqual, L_end); |
5777 | decrement(cnt); |
5778 | |
5779 | BIND(L_sloop); |
5780 | movq(Address(base, 0), xtmp); |
5781 | addptr(base, 8); |
5782 | decrement(cnt); |
5783 | jccb(Assembler::greaterEqual, L_sloop); |
5784 | BIND(L_end); |
5785 | } |
5786 | |
5787 | void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) { |
5788 | // cnt - number of qwords (8-byte words). |
5789 | // base - start address, qword aligned. |
// is_large - true if the optimizer knows cnt is larger than InitArrayShortSize
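//
// The dispatch strategy, in outline (a sketch of the control flow below):
//   if (!is_large && cnt <= InitArrayShortSize/BytesPerLong)
//     clear with a short pointer-sized store loop;
//   else if (UseFastStosb)     clear with rep stosb (byte count = cnt * 8);
//   else if (UseXMMForObjInit) clear with xmm_clear_mem();
//   else                       clear with rep stos;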
5791 | assert(base==rdi, "base register must be edi for rep stos" ); |
5792 | assert(tmp==rax, "tmp register must be eax for rep stos" ); |
5793 | assert(cnt==rcx, "cnt register must be ecx for rep stos" ); |
5794 | assert(InitArrayShortSize % BytesPerLong == 0, |
5795 | "InitArrayShortSize should be the multiple of BytesPerLong" ); |
5796 | |
5797 | Label DONE; |
5798 | |
5799 | if (!is_large || !UseXMMForObjInit) { |
5800 | xorptr(tmp, tmp); |
5801 | } |
5802 | |
5803 | if (!is_large) { |
5804 | Label LOOP, LONG; |
5805 | cmpptr(cnt, InitArrayShortSize/BytesPerLong); |
5806 | jccb(Assembler::greater, LONG); |
5807 | |
5808 | NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM |
5809 | |
5810 | decrement(cnt); |
5811 | jccb(Assembler::negative, DONE); // Zero length |
5812 | |
5813 | // Use individual pointer-sized stores for small counts: |
5814 | BIND(LOOP); |
5815 | movptr(Address(base, cnt, Address::times_ptr), tmp); |
5816 | decrement(cnt); |
5817 | jccb(Assembler::greaterEqual, LOOP); |
5818 | jmpb(DONE); |
5819 | |
5820 | BIND(LONG); |
5821 | } |
5822 | |
5823 | // Use longer rep-prefixed ops for non-small counts: |
5824 | if (UseFastStosb) { |
5825 | shlptr(cnt, 3); // convert to number of bytes |
5826 | rep_stosb(); |
5827 | } else if (UseXMMForObjInit) { |
5828 | movptr(tmp, base); |
5829 | xmm_clear_mem(tmp, cnt, xtmp); |
5830 | } else { |
5831 | NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM |
5832 | rep_stos(); |
5833 | } |
5834 | |
5835 | BIND(DONE); |
5836 | } |
5837 | |
5838 | #ifdef COMPILER2 |
5839 | |
5840 | // IndexOf for constant substrings with size >= 8 chars |
// which don't need to be loaded through the stack.
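//
// At the Java level the intrinsic corresponds roughly to the sketch below
// (names are illustrative, not the actual library code):
//
//   // returns the index of the first occurrence of substr in str, or -1
//   static int indexOf(char[] str, int strCount, char[] substr, int substrCount) {
//     for (int i = 0; i + substrCount <= strCount; i++) {
//       int j = 0;
//       while (j < substrCount && str[i + j] == substr[j]) j++;
//       if (j == substrCount) return i;
//     }
//     return -1;
//   }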
5842 | void MacroAssembler::string_indexofC8(Register str1, Register str2, |
5843 | Register cnt1, Register cnt2, |
5844 | int int_cnt2, Register result, |
5845 | XMMRegister vec, Register tmp, |
5846 | int ae) { |
5847 | ShortBranchVerifier sbv(this); |
5848 | assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required" ); |
5849 | assert(ae != StrIntrinsicNode::LU, "Invalid encoding" ); |
5850 | |
5851 | // This method uses the pcmpestri instruction with bound registers |
5852 | // inputs: |
5853 | // xmm - substring |
5854 | // rax - substring length (elements count) |
5855 | // mem - scanned string |
5856 | // rdx - string length (elements count) |
5857 | // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) |
5858 | // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) |
5859 | // outputs: |
5860 | // rcx - matched index in string |
5861 | assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri" ); |
5862 | int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts |
5863 | int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 |
5864 | Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; |
5865 | Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; |
5866 | |
5867 | Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, |
5868 | RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, |
5869 | MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; |
5870 | |
5871 | // Note, inline_string_indexOf() generates checks: |
5872 | // if (substr.count > string.count) return -1; |
5873 | // if (substr.count == 0) return 0; |
5874 | assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars" ); |
5875 | |
5876 | // Load substring. |
5877 | if (ae == StrIntrinsicNode::UL) { |
5878 | pmovzxbw(vec, Address(str2, 0)); |
5879 | } else { |
5880 | movdqu(vec, Address(str2, 0)); |
5881 | } |
5882 | movl(cnt2, int_cnt2); |
5883 | movptr(result, str1); // string addr |
5884 | |
5885 | if (int_cnt2 > stride) { |
5886 | jmpb(SCAN_TO_SUBSTR); |
5887 | |
// Reload substr for rescan; this code
// is executed only for large substrings (> 8 chars).
5890 | bind(RELOAD_SUBSTR); |
5891 | if (ae == StrIntrinsicNode::UL) { |
5892 | pmovzxbw(vec, Address(str2, 0)); |
5893 | } else { |
5894 | movdqu(vec, Address(str2, 0)); |
5895 | } |
5896 | negptr(cnt2); // Jumped here with negative cnt2, convert to positive |
5897 | |
5898 | bind(RELOAD_STR); |
5899 | // We came here after the beginning of the substring was |
5900 | // matched but the rest of it was not so we need to search |
5901 | // again. Start from the next element after the previous match. |
5902 | |
// cnt2 is the number of remaining substring elements and
// cnt1 is the number of remaining string elements when the compare failed.
5905 | // Restored cnt1 = cnt1 - cnt2 + int_cnt2 |
5906 | subl(cnt1, cnt2); |
5907 | addl(cnt1, int_cnt2); |
5908 | movl(cnt2, int_cnt2); // Now restore cnt2 |
5909 | |
5910 | decrementl(cnt1); // Shift to next element |
5911 | cmpl(cnt1, cnt2); |
jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
5913 | |
5914 | addptr(result, (1<<scale1)); |
5915 | |
5916 | } // (int_cnt2 > 8) |
5917 | |
5918 | // Scan string for start of substr in 16-byte vectors |
5919 | bind(SCAN_TO_SUBSTR); |
5920 | pcmpestri(vec, Address(result, 0), mode); |
5921 | jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 |
5922 | subl(cnt1, stride); |
5923 | jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string |
5924 | cmpl(cnt1, cnt2); |
jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
5926 | addptr(result, 16); |
5927 | jmpb(SCAN_TO_SUBSTR); |
5928 | |
5929 | // Found a potential substr |
5930 | bind(FOUND_CANDIDATE); |
5931 | // Matched whole vector if first element matched (tmp(rcx) == 0). |
5932 | if (int_cnt2 == stride) { |
5933 | jccb(Assembler::overflow, RET_FOUND); // OF == 1 |
5934 | } else { // int_cnt2 > 8 |
5935 | jccb(Assembler::overflow, FOUND_SUBSTR); |
5936 | } |
5937 | // After pcmpestri tmp(rcx) contains matched element index |
5938 | // Compute start addr of substr |
5939 | lea(result, Address(result, tmp, scale1)); |
5940 | |
5941 | // Make sure string is still long enough |
5942 | subl(cnt1, tmp); |
5943 | cmpl(cnt1, cnt2); |
5944 | if (int_cnt2 == stride) { |
5945 | jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); |
5946 | } else { // int_cnt2 > 8 |
5947 | jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); |
5948 | } |
// Left less than substring.
5950 | |
5951 | bind(RET_NOT_FOUND); |
5952 | movl(result, -1); |
5953 | jmp(EXIT); |
5954 | |
5955 | if (int_cnt2 > stride) { |
5956 | // This code is optimized for the case when whole substring |
5957 | // is matched if its head is matched. |
5958 | bind(MATCH_SUBSTR_HEAD); |
5959 | pcmpestri(vec, Address(result, 0), mode); |
// Reload only the string if it does not match
5961 | jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 |
5962 | |
5963 | Label CONT_SCAN_SUBSTR; |
5964 | // Compare the rest of substring (> 8 chars). |
5965 | bind(FOUND_SUBSTR); |
5966 | // First 8 chars are already matched. |
5967 | negptr(cnt2); |
5968 | addptr(cnt2, stride); |
5969 | |
5970 | bind(SCAN_SUBSTR); |
5971 | subl(cnt1, stride); |
5972 | cmpl(cnt2, -stride); // Do not read beyond substring |
5973 | jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); |
5974 | // Back-up strings to avoid reading beyond substring: |
5975 | // cnt1 = cnt1 - cnt2 + 8 |
5976 | addl(cnt1, cnt2); // cnt2 is negative |
5977 | addl(cnt1, stride); |
5978 | movl(cnt2, stride); negptr(cnt2); |
5979 | bind(CONT_SCAN_SUBSTR); |
5980 | if (int_cnt2 < (int)G) { |
5981 | int tail_off1 = int_cnt2<<scale1; |
5982 | int tail_off2 = int_cnt2<<scale2; |
5983 | if (ae == StrIntrinsicNode::UL) { |
5984 | pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); |
5985 | } else { |
5986 | movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); |
5987 | } |
5988 | pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); |
5989 | } else { |
5990 | // calculate index in register to avoid integer overflow (int_cnt2*2) |
5991 | movl(tmp, int_cnt2); |
5992 | addptr(tmp, cnt2); |
5993 | if (ae == StrIntrinsicNode::UL) { |
5994 | pmovzxbw(vec, Address(str2, tmp, scale2, 0)); |
5995 | } else { |
5996 | movdqu(vec, Address(str2, tmp, scale2, 0)); |
5997 | } |
5998 | pcmpestri(vec, Address(result, tmp, scale1, 0), mode); |
5999 | } |
// Need to reload string pointers if the whole vector did not match
6001 | jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 |
6002 | addptr(cnt2, stride); |
6003 | jcc(Assembler::negative, SCAN_SUBSTR); |
6004 | // Fall through if found full substring |
6005 | |
6006 | } // (int_cnt2 > 8) |
6007 | |
6008 | bind(RET_FOUND); |
6009 | // Found result if we matched full small substring. |
6010 | // Compute substr offset |
6011 | subptr(result, str1); |
6012 | if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { |
6013 | shrl(result, 1); // index |
6014 | } |
6015 | bind(EXIT); |
6016 | |
6017 | } // string_indexofC8 |
6018 | |
// Small strings are loaded through the stack if they cross a page boundary.
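// The page-boundary test used below is roughly (a sketch, assuming a
// power-of-two page size):
//
//   boolean mayCrossPage = (addr & (page_size - 1)) > page_size - 16;
//   // if true, copy the short string to the stack before the 16-byte load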
6020 | void MacroAssembler::string_indexof(Register str1, Register str2, |
6021 | Register cnt1, Register cnt2, |
6022 | int int_cnt2, Register result, |
6023 | XMMRegister vec, Register tmp, |
6024 | int ae) { |
6025 | ShortBranchVerifier sbv(this); |
6026 | assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required" ); |
6027 | assert(ae != StrIntrinsicNode::LU, "Invalid encoding" ); |
6028 | |
6029 | // |
6030 | // int_cnt2 is length of small (< 8 chars) constant substring |
6031 | // or (-1) for non constant substring in which case its length |
6032 | // is in cnt2 register. |
6033 | // |
6034 | // Note, inline_string_indexOf() generates checks: |
6035 | // if (substr.count > string.count) return -1; |
6036 | // if (substr.count == 0) return 0; |
6037 | // |
6038 | int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 |
6039 | assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0" ); |
6040 | // This method uses the pcmpestri instruction with bound registers |
6041 | // inputs: |
6042 | // xmm - substring |
6043 | // rax - substring length (elements count) |
6044 | // mem - scanned string |
6045 | // rdx - string length (elements count) |
6046 | // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) |
6047 | // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) |
6048 | // outputs: |
6049 | // rcx - matched index in string |
6050 | assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri" ); |
6051 | int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts |
6052 | Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; |
6053 | Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; |
6054 | |
6055 | Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, |
6056 | RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, |
6057 | FOUND_CANDIDATE; |
6058 | |
6059 | { //======================================================== |
6060 | // We don't know where these strings are located |
// and we can't read beyond them. Load them through the stack.
6062 | Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; |
6063 | |
6064 | movptr(tmp, rsp); // save old SP |
6065 | |
6066 | if (int_cnt2 > 0) { // small (< 8 chars) constant substring |
6067 | if (int_cnt2 == (1>>scale2)) { // One byte |
6068 | assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding" ); |
6069 | load_unsigned_byte(result, Address(str2, 0)); |
6070 | movdl(vec, result); // move 32 bits |
6071 | } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes |
6072 | // Not enough header space in 32-bit VM: 12+3 = 15. |
6073 | movl(result, Address(str2, -1)); |
6074 | shrl(result, 8); |
6075 | movdl(vec, result); // move 32 bits |
6076 | } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char |
6077 | load_unsigned_short(result, Address(str2, 0)); |
6078 | movdl(vec, result); // move 32 bits |
6079 | } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars |
6080 | movdl(vec, Address(str2, 0)); // move 32 bits |
6081 | } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars |
6082 | movq(vec, Address(str2, 0)); // move 64 bits |
6083 | } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) |
6084 | // Array header size is 12 bytes in 32-bit VM |
6085 | // + 6 bytes for 3 chars == 18 bytes, |
6086 | // enough space to load vec and shift. |
6087 | assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity" ); |
6088 | if (ae == StrIntrinsicNode::UL) { |
6089 | int tail_off = int_cnt2-8; |
6090 | pmovzxbw(vec, Address(str2, tail_off)); |
6091 | psrldq(vec, -2*tail_off); |
6092 | } |
6093 | else { |
6094 | int tail_off = int_cnt2*(1<<scale2); |
6095 | movdqu(vec, Address(str2, tail_off-16)); |
6096 | psrldq(vec, 16-tail_off); |
6097 | } |
6098 | } |
6099 | } else { // not constant substring |
6100 | cmpl(cnt2, stride); |
6101 | jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough |
6102 | |
// We can read beyond the string if str+16 does not cross a page boundary
6104 | // since heaps are aligned and mapped by pages. |
6105 | assert(os::vm_page_size() < (int)G, "default page should be small" ); |
6106 | movl(result, str2); // We need only low 32 bits |
6107 | andl(result, (os::vm_page_size()-1)); |
6108 | cmpl(result, (os::vm_page_size()-16)); |
6109 | jccb(Assembler::belowEqual, CHECK_STR); |
6110 | |
// Move small strings to the stack to allow loading 16 bytes into vec.
6112 | subptr(rsp, 16); |
6113 | int stk_offset = wordSize-(1<<scale2); |
6114 | push(cnt2); |
6115 | |
6116 | bind(COPY_SUBSTR); |
6117 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { |
6118 | load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); |
6119 | movb(Address(rsp, cnt2, scale2, stk_offset), result); |
6120 | } else if (ae == StrIntrinsicNode::UU) { |
6121 | load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); |
6122 | movw(Address(rsp, cnt2, scale2, stk_offset), result); |
6123 | } |
6124 | decrement(cnt2); |
6125 | jccb(Assembler::notZero, COPY_SUBSTR); |
6126 | |
6127 | pop(cnt2); |
6128 | movptr(str2, rsp); // New substring address |
6129 | } // non constant |
6130 | |
6131 | bind(CHECK_STR); |
6132 | cmpl(cnt1, stride); |
6133 | jccb(Assembler::aboveEqual, BIG_STRINGS); |
6134 | |
6135 | // Check cross page boundary. |
6136 | movl(result, str1); // We need only low 32 bits |
6137 | andl(result, (os::vm_page_size()-1)); |
6138 | cmpl(result, (os::vm_page_size()-16)); |
6139 | jccb(Assembler::belowEqual, BIG_STRINGS); |
6140 | |
6141 | subptr(rsp, 16); |
6142 | int stk_offset = -(1<<scale1); |
6143 | if (int_cnt2 < 0) { // not constant |
6144 | push(cnt2); |
6145 | stk_offset += wordSize; |
6146 | } |
6147 | movl(cnt2, cnt1); |
6148 | |
6149 | bind(COPY_STR); |
6150 | if (ae == StrIntrinsicNode::LL) { |
6151 | load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); |
6152 | movb(Address(rsp, cnt2, scale1, stk_offset), result); |
6153 | } else { |
6154 | load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); |
6155 | movw(Address(rsp, cnt2, scale1, stk_offset), result); |
6156 | } |
6157 | decrement(cnt2); |
6158 | jccb(Assembler::notZero, COPY_STR); |
6159 | |
6160 | if (int_cnt2 < 0) { // not constant |
6161 | pop(cnt2); |
6162 | } |
6163 | movptr(str1, rsp); // New string address |
6164 | |
6165 | bind(BIG_STRINGS); |
6166 | // Load substring. |
6167 | if (int_cnt2 < 0) { // -1 |
6168 | if (ae == StrIntrinsicNode::UL) { |
6169 | pmovzxbw(vec, Address(str2, 0)); |
6170 | } else { |
6171 | movdqu(vec, Address(str2, 0)); |
6172 | } |
6173 | push(cnt2); // substr count |
6174 | push(str2); // substr addr |
6175 | push(str1); // string addr |
6176 | } else { |
6177 | // Small (< 8 chars) constant substrings are loaded already. |
6178 | movl(cnt2, int_cnt2); |
6179 | } |
6180 | push(tmp); // original SP |
6181 | |
6182 | } // Finished loading |
6183 | |
6184 | //======================================================== |
6185 | // Start search |
6186 | // |
6187 | |
6188 | movptr(result, str1); // string addr |
6189 | |
6190 | if (int_cnt2 < 0) { // Only for non constant substring |
6191 | jmpb(SCAN_TO_SUBSTR); |
6192 | |
6193 | // SP saved at sp+0 |
6194 | // String saved at sp+1*wordSize |
6195 | // Substr saved at sp+2*wordSize |
6196 | // Substr count saved at sp+3*wordSize |
6197 | |
// Reload substr for rescan; this code
// is executed only for large substrings (> 8 chars).
6200 | bind(RELOAD_SUBSTR); |
6201 | movptr(str2, Address(rsp, 2*wordSize)); |
6202 | movl(cnt2, Address(rsp, 3*wordSize)); |
6203 | if (ae == StrIntrinsicNode::UL) { |
6204 | pmovzxbw(vec, Address(str2, 0)); |
6205 | } else { |
6206 | movdqu(vec, Address(str2, 0)); |
6207 | } |
6208 | // We came here after the beginning of the substring was |
6209 | // matched but the rest of it was not so we need to search |
6210 | // again. Start from the next element after the previous match. |
6211 | subptr(str1, result); // Restore counter |
6212 | if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { |
6213 | shrl(str1, 1); |
6214 | } |
6215 | addl(cnt1, str1); |
6216 | decrementl(cnt1); // Shift to next element |
6217 | cmpl(cnt1, cnt2); |
jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
6219 | |
6220 | addptr(result, (1<<scale1)); |
6221 | } // non constant |
6222 | |
6223 | // Scan string for start of substr in 16-byte vectors |
6224 | bind(SCAN_TO_SUBSTR); |
6225 | assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri" ); |
6226 | pcmpestri(vec, Address(result, 0), mode); |
6227 | jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 |
6228 | subl(cnt1, stride); |
6229 | jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string |
6230 | cmpl(cnt1, cnt2); |
jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
6232 | addptr(result, 16); |
6233 | |
6234 | bind(ADJUST_STR); |
6235 | cmpl(cnt1, stride); // Do not read beyond string |
6236 | jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); |
6237 | // Back-up string to avoid reading beyond string. |
6238 | lea(result, Address(result, cnt1, scale1, -16)); |
6239 | movl(cnt1, stride); |
6240 | jmpb(SCAN_TO_SUBSTR); |
6241 | |
6242 | // Found a potential substr |
6243 | bind(FOUND_CANDIDATE); |
6244 | // After pcmpestri tmp(rcx) contains matched element index |
6245 | |
6246 | // Make sure string is still long enough |
6247 | subl(cnt1, tmp); |
6248 | cmpl(cnt1, cnt2); |
6249 | jccb(Assembler::greaterEqual, FOUND_SUBSTR); |
// Left less than substring.
6251 | |
6252 | bind(RET_NOT_FOUND); |
6253 | movl(result, -1); |
6254 | jmp(CLEANUP); |
6255 | |
6256 | bind(FOUND_SUBSTR); |
6257 | // Compute start addr of substr |
6258 | lea(result, Address(result, tmp, scale1)); |
6259 | if (int_cnt2 > 0) { // Constant substring |
6260 | // Repeat search for small substring (< 8 chars) |
6261 | // from new point without reloading substring. |
6262 | // Have to check that we don't read beyond string. |
6263 | cmpl(tmp, stride-int_cnt2); |
6264 | jccb(Assembler::greater, ADJUST_STR); |
6265 | // Fall through if matched whole substring. |
6266 | } else { // non constant |
6267 | assert(int_cnt2 == -1, "should be != 0" ); |
6268 | |
6269 | addl(tmp, cnt2); |
6270 | // Found result if we matched whole substring. |
6271 | cmpl(tmp, stride); |
6272 | jcc(Assembler::lessEqual, RET_FOUND); |
6273 | |
6274 | // Repeat search for small substring (<= 8 chars) |
6275 | // from new point 'str1' without reloading substring. |
6276 | cmpl(cnt2, stride); |
6277 | // Have to check that we don't read beyond string. |
6278 | jccb(Assembler::lessEqual, ADJUST_STR); |
6279 | |
6280 | Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; |
6281 | // Compare the rest of substring (> 8 chars). |
6282 | movptr(str1, result); |
6283 | |
6284 | cmpl(tmp, cnt2); |
6285 | // First 8 chars are already matched. |
6286 | jccb(Assembler::equal, CHECK_NEXT); |
6287 | |
6288 | bind(SCAN_SUBSTR); |
6289 | pcmpestri(vec, Address(str1, 0), mode); |
// Need to reload string pointers if the whole vector did not match
6291 | jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 |
6292 | |
6293 | bind(CHECK_NEXT); |
6294 | subl(cnt2, stride); |
6295 | jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring |
6296 | addptr(str1, 16); |
6297 | if (ae == StrIntrinsicNode::UL) { |
6298 | addptr(str2, 8); |
6299 | } else { |
6300 | addptr(str2, 16); |
6301 | } |
6302 | subl(cnt1, stride); |
6303 | cmpl(cnt2, stride); // Do not read beyond substring |
6304 | jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); |
6305 | // Back-up strings to avoid reading beyond substring. |
6306 | |
6307 | if (ae == StrIntrinsicNode::UL) { |
6308 | lea(str2, Address(str2, cnt2, scale2, -8)); |
6309 | lea(str1, Address(str1, cnt2, scale1, -16)); |
6310 | } else { |
6311 | lea(str2, Address(str2, cnt2, scale2, -16)); |
6312 | lea(str1, Address(str1, cnt2, scale1, -16)); |
6313 | } |
6314 | subl(cnt1, cnt2); |
6315 | movl(cnt2, stride); |
6316 | addl(cnt1, stride); |
6317 | bind(CONT_SCAN_SUBSTR); |
6318 | if (ae == StrIntrinsicNode::UL) { |
6319 | pmovzxbw(vec, Address(str2, 0)); |
6320 | } else { |
6321 | movdqu(vec, Address(str2, 0)); |
6322 | } |
6323 | jmp(SCAN_SUBSTR); |
6324 | |
6325 | bind(RET_FOUND_LONG); |
6326 | movptr(str1, Address(rsp, wordSize)); |
6327 | } // non constant |
6328 | |
6329 | bind(RET_FOUND); |
6330 | // Compute substr offset |
6331 | subptr(result, str1); |
6332 | if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { |
6333 | shrl(result, 1); // index |
6334 | } |
6335 | bind(CLEANUP); |
6336 | pop(rsp); // restore SP |
6337 | |
6338 | } // string_indexof |
6339 | |
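// Intrinsic for an indexOf(char) style search over UTF-16 chars. At the Java
// level it corresponds roughly to this sketch (illustrative only):
//
//   static int indexOfChar(char[] str, int len, char ch) {
//     for (int i = 0; i < len; i++) {
//       if (str[i] == ch) return i;
//     }
//     return -1;
//   }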
6340 | void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, |
6341 | XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { |
6342 | ShortBranchVerifier sbv(this); |
6343 | assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required" ); |
6344 | |
6345 | int stride = 8; |
6346 | |
6347 | Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, |
6348 | SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, |
6349 | RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, |
6350 | FOUND_SEQ_CHAR, DONE_LABEL; |
6351 | |
6352 | movptr(result, str1); |
6353 | if (UseAVX >= 2) { |
6354 | cmpl(cnt1, stride); |
6355 | jcc(Assembler::less, SCAN_TO_CHAR_LOOP); |
6356 | cmpl(cnt1, 2*stride); |
6357 | jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); |
6358 | movdl(vec1, ch); |
6359 | vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); |
6360 | vpxor(vec2, vec2); |
6361 | movl(tmp, cnt1); |
6362 | andl(tmp, 0xFFFFFFF0); //vector count (in chars) |
6363 | andl(cnt1,0x0000000F); //tail count (in chars) |
6364 | |
6365 | bind(SCAN_TO_16_CHAR_LOOP); |
6366 | vmovdqu(vec3, Address(result, 0)); |
6367 | vpcmpeqw(vec3, vec3, vec1, 1); |
6368 | vptest(vec2, vec3); |
6369 | jcc(Assembler::carryClear, FOUND_CHAR); |
6370 | addptr(result, 32); |
6371 | subl(tmp, 2*stride); |
6372 | jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); |
6373 | jmp(SCAN_TO_8_CHAR); |
6374 | bind(SCAN_TO_8_CHAR_INIT); |
6375 | movdl(vec1, ch); |
6376 | pshuflw(vec1, vec1, 0x00); |
6377 | pshufd(vec1, vec1, 0); |
6378 | pxor(vec2, vec2); |
6379 | } |
6380 | bind(SCAN_TO_8_CHAR); |
6381 | cmpl(cnt1, stride); |
6382 | if (UseAVX >= 2) { |
6383 | jcc(Assembler::less, SCAN_TO_CHAR); |
6384 | } else { |
6385 | jcc(Assembler::less, SCAN_TO_CHAR_LOOP); |
6386 | movdl(vec1, ch); |
6387 | pshuflw(vec1, vec1, 0x00); |
6388 | pshufd(vec1, vec1, 0); |
6389 | pxor(vec2, vec2); |
6390 | } |
6391 | movl(tmp, cnt1); |
6392 | andl(tmp, 0xFFFFFFF8); //vector count (in chars) |
6393 | andl(cnt1,0x00000007); //tail count (in chars) |
6394 | |
6395 | bind(SCAN_TO_8_CHAR_LOOP); |
6396 | movdqu(vec3, Address(result, 0)); |
6397 | pcmpeqw(vec3, vec1); |
6398 | ptest(vec2, vec3); |
6399 | jcc(Assembler::carryClear, FOUND_CHAR); |
6400 | addptr(result, 16); |
6401 | subl(tmp, stride); |
6402 | jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); |
6403 | bind(SCAN_TO_CHAR); |
6404 | testl(cnt1, cnt1); |
6405 | jcc(Assembler::zero, RET_NOT_FOUND); |
6406 | bind(SCAN_TO_CHAR_LOOP); |
6407 | load_unsigned_short(tmp, Address(result, 0)); |
6408 | cmpl(ch, tmp); |
6409 | jccb(Assembler::equal, FOUND_SEQ_CHAR); |
6410 | addptr(result, 2); |
6411 | subl(cnt1, 1); |
6412 | jccb(Assembler::zero, RET_NOT_FOUND); |
6413 | jmp(SCAN_TO_CHAR_LOOP); |
6414 | |
6415 | bind(RET_NOT_FOUND); |
6416 | movl(result, -1); |
6417 | jmpb(DONE_LABEL); |
6418 | |
6419 | bind(FOUND_CHAR); |
6420 | if (UseAVX >= 2) { |
6421 | vpmovmskb(tmp, vec3); |
6422 | } else { |
6423 | pmovmskb(tmp, vec3); |
6424 | } |
6425 | bsfl(ch, tmp); |
6426 | addl(result, ch); |
6427 | |
6428 | bind(FOUND_SEQ_CHAR); |
6429 | subptr(result, str1); |
6430 | shrl(result, 1); |
6431 | |
6432 | bind(DONE_LABEL); |
6433 | } // string_indexof_char |
6434 | |
6435 | // helper function for string_compare |
6436 | void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, |
6437 | Address::ScaleFactor scale, Address::ScaleFactor scale1, |
6438 | Address::ScaleFactor scale2, Register index, int ae) { |
6439 | if (ae == StrIntrinsicNode::LL) { |
6440 | load_unsigned_byte(elem1, Address(str1, index, scale, 0)); |
6441 | load_unsigned_byte(elem2, Address(str2, index, scale, 0)); |
6442 | } else if (ae == StrIntrinsicNode::UU) { |
6443 | load_unsigned_short(elem1, Address(str1, index, scale, 0)); |
6444 | load_unsigned_short(elem2, Address(str2, index, scale, 0)); |
6445 | } else { |
6446 | load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); |
6447 | load_unsigned_short(elem2, Address(str2, index, scale2, 0)); |
6448 | } |
6449 | } |
6450 | |
6451 | // Compare strings, used for char[] and byte[]. |
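// Semantically this follows String.compareTo: compare elements up to the
// shorter length and return the first difference, otherwise return the
// length difference. A rough Java-level sketch (illustrative only):
//
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }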
6452 | void MacroAssembler::string_compare(Register str1, Register str2, |
6453 | Register cnt1, Register cnt2, Register result, |
6454 | XMMRegister vec1, int ae) { |
6455 | ShortBranchVerifier sbv(this); |
6456 | Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; |
6457 | Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 |
6458 | int stride, stride2, adr_stride, adr_stride1, adr_stride2; |
6459 | int stride2x2 = 0x40; |
6460 | Address::ScaleFactor scale = Address::no_scale; |
6461 | Address::ScaleFactor scale1 = Address::no_scale; |
6462 | Address::ScaleFactor scale2 = Address::no_scale; |
6463 | |
6464 | if (ae != StrIntrinsicNode::LL) { |
6465 | stride2x2 = 0x20; |
6466 | } |
6467 | |
6468 | if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { |
6469 | shrl(cnt2, 1); |
6470 | } |
6471 | // Compute the minimum of the string lengths and the |
6472 | // difference of the string lengths (stack). |
6473 | // Do the conditional move stuff |
6474 | movl(result, cnt1); |
6475 | subl(cnt1, cnt2); |
6476 | push(cnt1); |
6477 | cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) |
6478 | |
6479 | // Is the minimum length zero? |
6480 | testl(cnt2, cnt2); |
6481 | jcc(Assembler::zero, LENGTH_DIFF_LABEL); |
6482 | if (ae == StrIntrinsicNode::LL) { |
6483 | // Load first bytes |
6484 | load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] |
6485 | load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] |
6486 | } else if (ae == StrIntrinsicNode::UU) { |
6487 | // Load first characters |
6488 | load_unsigned_short(result, Address(str1, 0)); |
6489 | load_unsigned_short(cnt1, Address(str2, 0)); |
6490 | } else { |
6491 | load_unsigned_byte(result, Address(str1, 0)); |
6492 | load_unsigned_short(cnt1, Address(str2, 0)); |
6493 | } |
6494 | subl(result, cnt1); |
6495 | jcc(Assembler::notZero, POP_LABEL); |
6496 | |
6497 | if (ae == StrIntrinsicNode::UU) { |
6498 | // Divide length by 2 to get number of chars |
6499 | shrl(cnt2, 1); |
6500 | } |
6501 | cmpl(cnt2, 1); |
6502 | jcc(Assembler::equal, LENGTH_DIFF_LABEL); |
6503 | |
6504 | // Check if the strings start at the same location and setup scale and stride |
6505 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6506 | cmpptr(str1, str2); |
6507 | jcc(Assembler::equal, LENGTH_DIFF_LABEL); |
6508 | if (ae == StrIntrinsicNode::LL) { |
6509 | scale = Address::times_1; |
6510 | stride = 16; |
6511 | } else { |
6512 | scale = Address::times_2; |
6513 | stride = 8; |
6514 | } |
6515 | } else { |
6516 | scale1 = Address::times_1; |
6517 | scale2 = Address::times_2; |
6518 | // scale not used |
6519 | stride = 8; |
6520 | } |
6521 | |
6522 | if (UseAVX >= 2 && UseSSE42Intrinsics) { |
6523 | Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; |
6524 | Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; |
6525 | Label COMPARE_WIDE_VECTORS_LOOP_AVX2; |
6526 | Label COMPARE_TAIL_LONG; |
6527 | Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 |
6528 | |
6529 | int pcmpmask = 0x19; |
6530 | if (ae == StrIntrinsicNode::LL) { |
6531 | pcmpmask &= ~0x01; |
6532 | } |
6533 | |
// Setup to compare 16-char (32-byte) vectors,
6535 | // start from first character again because it has aligned address. |
6536 | if (ae == StrIntrinsicNode::LL) { |
6537 | stride2 = 32; |
6538 | } else { |
6539 | stride2 = 16; |
6540 | } |
6541 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6542 | adr_stride = stride << scale; |
6543 | } else { |
6544 | adr_stride1 = 8; //stride << scale1; |
6545 | adr_stride2 = 16; //stride << scale2; |
6546 | } |
6547 | |
6548 | assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri" ); |
// rax and rdx are used by pcmpestri as element counters
6550 | movl(result, cnt2); |
6551 | andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count |
6552 | jcc(Assembler::zero, COMPARE_TAIL_LONG); |
6553 | |
// Fast path: compare first 2 8-char vectors.
6555 | bind(COMPARE_16_CHARS); |
6556 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6557 | movdqu(vec1, Address(str1, 0)); |
6558 | } else { |
6559 | pmovzxbw(vec1, Address(str1, 0)); |
6560 | } |
6561 | pcmpestri(vec1, Address(str2, 0), pcmpmask); |
6562 | jccb(Assembler::below, COMPARE_INDEX_CHAR); |
6563 | |
6564 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6565 | movdqu(vec1, Address(str1, adr_stride)); |
6566 | pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); |
6567 | } else { |
6568 | pmovzxbw(vec1, Address(str1, adr_stride1)); |
6569 | pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); |
6570 | } |
6571 | jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); |
6572 | addl(cnt1, stride); |
6573 | |
6574 | // Compare the characters at index in cnt1 |
6575 | bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character |
6576 | load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); |
6577 | subl(result, cnt2); |
6578 | jmp(POP_LABEL); |
6579 | |
6580 | // Setup the registers to start vector comparison loop |
6581 | bind(COMPARE_WIDE_VECTORS); |
6582 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6583 | lea(str1, Address(str1, result, scale)); |
6584 | lea(str2, Address(str2, result, scale)); |
6585 | } else { |
6586 | lea(str1, Address(str1, result, scale1)); |
6587 | lea(str2, Address(str2, result, scale2)); |
6588 | } |
6589 | subl(result, stride2); |
6590 | subl(cnt2, stride2); |
6591 | jcc(Assembler::zero, COMPARE_WIDE_TAIL); |
6592 | negptr(result); |
6593 | |
// In a loop, compare 16 chars (32 bytes) at a time using (vpxor+vptest)
6595 | bind(COMPARE_WIDE_VECTORS_LOOP); |
6596 | |
6597 | #ifdef _LP64 |
6598 | if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
6599 | cmpl(cnt2, stride2x2); |
6600 | jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
6601 | testl(cnt2, stride2x2-1); // cnt2 holds the vector count |
6602 | jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 |
6603 | |
6604 | bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop |
6605 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6606 | evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); |
6607 | evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 |
6608 | } else { |
6609 | vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); |
6610 | evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 |
6611 | } |
6612 | kortestql(k7, k7); |
6613 | jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare |
6614 | addptr(result, stride2x2); // update since we already compared at this addr |
6615 | subl(cnt2, stride2x2); // and sub the size too |
6616 | jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); |
6617 | |
6618 | vpxor(vec1, vec1); |
6619 | jmpb(COMPARE_WIDE_TAIL); |
6620 | }//if (VM_Version::supports_avx512vlbw()) |
6621 | #endif // _LP64 |
6622 | |
6623 | |
6624 | bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); |
6625 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6626 | vmovdqu(vec1, Address(str1, result, scale)); |
6627 | vpxor(vec1, Address(str2, result, scale)); |
6628 | } else { |
6629 | vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); |
6630 | vpxor(vec1, Address(str2, result, scale2)); |
6631 | } |
6632 | vptest(vec1, vec1); |
6633 | jcc(Assembler::notZero, VECTOR_NOT_EQUAL); |
6634 | addptr(result, stride2); |
6635 | subl(cnt2, stride2); |
6636 | jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); |
6637 | // clean upper bits of YMM registers |
6638 | vpxor(vec1, vec1); |
6639 | |
6640 | // compare wide vectors tail |
6641 | bind(COMPARE_WIDE_TAIL); |
6642 | testptr(result, result); |
6643 | jcc(Assembler::zero, LENGTH_DIFF_LABEL); |
6644 | |
6645 | movl(result, stride2); |
6646 | movl(cnt2, result); |
6647 | negptr(result); |
6648 | jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); |
6649 | |
// Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6651 | bind(VECTOR_NOT_EQUAL); |
6652 | // clean upper bits of YMM registers |
6653 | vpxor(vec1, vec1); |
6654 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6655 | lea(str1, Address(str1, result, scale)); |
6656 | lea(str2, Address(str2, result, scale)); |
6657 | } else { |
6658 | lea(str1, Address(str1, result, scale1)); |
6659 | lea(str2, Address(str2, result, scale2)); |
6660 | } |
6661 | jmp(COMPARE_16_CHARS); |
6662 | |
// Compare tail chars, length between 1 and 15 chars
6664 | bind(COMPARE_TAIL_LONG); |
6665 | movl(cnt2, result); |
6666 | cmpl(cnt2, stride); |
6667 | jcc(Assembler::less, COMPARE_SMALL_STR); |
6668 | |
6669 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6670 | movdqu(vec1, Address(str1, 0)); |
6671 | } else { |
6672 | pmovzxbw(vec1, Address(str1, 0)); |
6673 | } |
6674 | pcmpestri(vec1, Address(str2, 0), pcmpmask); |
6675 | jcc(Assembler::below, COMPARE_INDEX_CHAR); |
6676 | subptr(cnt2, stride); |
6677 | jcc(Assembler::zero, LENGTH_DIFF_LABEL); |
6678 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6679 | lea(str1, Address(str1, result, scale)); |
6680 | lea(str2, Address(str2, result, scale)); |
6681 | } else { |
6682 | lea(str1, Address(str1, result, scale1)); |
6683 | lea(str2, Address(str2, result, scale2)); |
6684 | } |
6685 | negptr(cnt2); |
6686 | jmpb(WHILE_HEAD_LABEL); |
6687 | |
6688 | bind(COMPARE_SMALL_STR); |
6689 | } else if (UseSSE42Intrinsics) { |
6690 | Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; |
6691 | int pcmpmask = 0x19; |
6692 | // Setup to compare 8-char (16-byte) vectors, |
6693 | // start from first character again because it has aligned address. |
6694 | movl(result, cnt2); |
6695 | andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count |
6696 | if (ae == StrIntrinsicNode::LL) { |
6697 | pcmpmask &= ~0x01; |
6698 | } |
6699 | jcc(Assembler::zero, COMPARE_TAIL); |
6700 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6701 | lea(str1, Address(str1, result, scale)); |
6702 | lea(str2, Address(str2, result, scale)); |
6703 | } else { |
6704 | lea(str1, Address(str1, result, scale1)); |
6705 | lea(str2, Address(str2, result, scale2)); |
6706 | } |
6707 | negptr(result); |
6708 | |
6709 | // pcmpestri |
6710 | // inputs: |
6711 | // vec1- substring |
6712 | // rax - negative string length (elements count) |
6713 | // mem - scanned string |
6714 | // rdx - string length (elements count) |
6715 | // pcmpmask - cmp mode: 11000 (string compare with negated result) |
6716 | // + 00 (unsigned bytes) or + 01 (unsigned shorts) |
6717 | // outputs: |
6718 | // rcx - first mismatched element index |
6719 | assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri" ); |
6720 | |
6721 | bind(COMPARE_WIDE_VECTORS); |
6722 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6723 | movdqu(vec1, Address(str1, result, scale)); |
6724 | pcmpestri(vec1, Address(str2, result, scale), pcmpmask); |
6725 | } else { |
6726 | pmovzxbw(vec1, Address(str1, result, scale1)); |
6727 | pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); |
6728 | } |
6729 | // After pcmpestri cnt1(rcx) contains mismatched element index |
6730 | |
6731 | jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 |
6732 | addptr(result, stride); |
6733 | subptr(cnt2, stride); |
6734 | jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); |
6735 | |
6736 | // compare wide vectors tail |
6737 | testptr(result, result); |
6738 | jcc(Assembler::zero, LENGTH_DIFF_LABEL); |
6739 | |
6740 | movl(cnt2, stride); |
6741 | movl(result, stride); |
6742 | negptr(result); |
6743 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6744 | movdqu(vec1, Address(str1, result, scale)); |
6745 | pcmpestri(vec1, Address(str2, result, scale), pcmpmask); |
6746 | } else { |
6747 | pmovzxbw(vec1, Address(str1, result, scale1)); |
6748 | pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); |
6749 | } |
6750 | jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); |
6751 | |
6752 | // Mismatched characters in the vectors |
6753 | bind(VECTOR_NOT_EQUAL); |
6754 | addptr(cnt1, result); |
6755 | load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); |
6756 | subl(result, cnt2); |
6757 | jmpb(POP_LABEL); |
6758 | |
6759 | bind(COMPARE_TAIL); // limit is zero |
6760 | movl(cnt2, result); |
6761 | // Fallthru to tail compare |
6762 | } |
6763 | // Shift str2 and str1 to the end of the arrays, negate min |
6764 | if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { |
6765 | lea(str1, Address(str1, cnt2, scale)); |
6766 | lea(str2, Address(str2, cnt2, scale)); |
6767 | } else { |
6768 | lea(str1, Address(str1, cnt2, scale1)); |
6769 | lea(str2, Address(str2, cnt2, scale2)); |
6770 | } |
6771 | decrementl(cnt2); // first character was compared already |
6772 | negptr(cnt2); |
6773 | |
6774 | // Compare the rest of the elements |
6775 | bind(WHILE_HEAD_LABEL); |
6776 | load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); |
6777 | subl(result, cnt1); |
6778 | jccb(Assembler::notZero, POP_LABEL); |
6779 | increment(cnt2); |
6780 | jccb(Assembler::notZero, WHILE_HEAD_LABEL); |
6781 | |
6782 | // Strings are equal up to min length. Return the length difference. |
6783 | bind(LENGTH_DIFF_LABEL); |
6784 | pop(result); |
6785 | if (ae == StrIntrinsicNode::UU) { |
6786 | // Divide diff by 2 to get number of chars |
6787 | sarl(result, 1); |
6788 | } |
6789 | jmpb(DONE_LABEL); |
6790 | |
6791 | #ifdef _LP64 |
6792 | if (VM_Version::supports_avx512vlbw()) { |
6793 | |
6794 | bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); |
6795 | |
6796 | kmovql(cnt1, k7); |
6797 | notq(cnt1); |
6798 | bsfq(cnt2, cnt1); |
6799 | if (ae != StrIntrinsicNode::LL) { |
6800 | // Divide diff by 2 to get number of chars |
6801 | sarl(cnt2, 1); |
6802 | } |
6803 | addq(result, cnt2); |
6804 | if (ae == StrIntrinsicNode::LL) { |
6805 | load_unsigned_byte(cnt1, Address(str2, result)); |
6806 | load_unsigned_byte(result, Address(str1, result)); |
6807 | } else if (ae == StrIntrinsicNode::UU) { |
6808 | load_unsigned_short(cnt1, Address(str2, result, scale)); |
6809 | load_unsigned_short(result, Address(str1, result, scale)); |
6810 | } else { |
6811 | load_unsigned_short(cnt1, Address(str2, result, scale2)); |
6812 | load_unsigned_byte(result, Address(str1, result, scale1)); |
6813 | } |
6814 | subl(result, cnt1); |
6815 | jmpb(POP_LABEL); |
6816 | }//if (VM_Version::supports_avx512vlbw()) |
6817 | #endif // _LP64 |
6818 | |
6819 | // Discard the stored length difference |
6820 | bind(POP_LABEL); |
6821 | pop(cnt1); |
6822 | |
6823 | // That's it |
6824 | bind(DONE_LABEL); |
6825 | if(ae == StrIntrinsicNode::UL) { |
6826 | negl(result); |
6827 | } |
6828 | |
6829 | } |
6830 | |
6831 | // Search for Non-ASCII character (Negative byte value) in a byte array, |
6832 | // return true if it has any and false otherwise. |
6833 | // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java |
6834 | // @HotSpotIntrinsicCandidate |
6835 | // private static boolean hasNegatives(byte[] ba, int off, int len) { |
6836 | // for (int i = off; i < off + len; i++) { |
6837 | // if (ba[i] < 0) { |
6838 | // return true; |
6839 | // } |
6840 | // } |
6841 | // return false; |
6842 | // } |
6843 | void MacroAssembler::has_negatives(Register ary1, Register len, |
6844 | Register result, Register tmp1, |
6845 | XMMRegister vec1, XMMRegister vec2) { |
6846 | // rsi: byte array |
6847 | // rcx: len |
6848 | // rax: result |
6849 | ShortBranchVerifier sbv(this); |
6850 | assert_different_registers(ary1, len, result, tmp1); |
6851 | assert_different_registers(vec1, vec2); |
6852 | Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; |
6853 | |
6854 | // len == 0 |
6855 | testl(len, len); |
6856 | jcc(Assembler::zero, FALSE_LABEL); |
6857 | |
6858 | if ((UseAVX > 2) && // AVX512 |
6859 | VM_Version::supports_avx512vlbw() && |
6860 | VM_Version::supports_bmi2()) { |
6861 | |
6862 | Label test_64_loop, test_tail; |
6863 | Register tmp3_aliased = len; |
6864 | |
6865 | movl(tmp1, len); |
6866 | vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); |
6867 | |
6868 | andl(tmp1, 64 - 1); // tail count (in chars) 0x3F |
6869 | andl(len, ~(64 - 1)); // vector count (in chars) |
6870 | jccb(Assembler::zero, test_tail); |
6871 | |
6872 | lea(ary1, Address(ary1, len, Address::times_1)); |
6873 | negptr(len); |
6874 | |
6875 | bind(test_64_loop); |
// Check whether our 64 byte-sized elements contain any negatives
6877 | evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); |
6878 | kortestql(k2, k2); |
6879 | jcc(Assembler::notZero, TRUE_LABEL); |
6880 | |
6881 | addptr(len, 64); |
6882 | jccb(Assembler::notZero, test_64_loop); |
6883 | |
6884 | |
6885 | bind(test_tail); |
6886 | // bail out when there is nothing to be done |
6887 | testl(tmp1, -1); |
6888 | jcc(Assembler::zero, FALSE_LABEL); |
6889 | |
6890 | // ~(~0 << len) applied up to two times (for 32-bit scenario) |
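// e.g. (64-bit case) with a tail count of 5: ~0 << 5 == ...11100000b, and
// negating that gives 0x1F, i.e. five 1s starting from the least significant bit.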
6891 | #ifdef _LP64 |
6892 | mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); |
6893 | shlxq(tmp3_aliased, tmp3_aliased, tmp1); |
6894 | notq(tmp3_aliased); |
6895 | kmovql(k3, tmp3_aliased); |
6896 | #else |
6897 | Label k_init; |
6898 | jmp(k_init); |
6899 | |
// We cannot read 64 bits from a general purpose register, thus we move the
// data required to compose the 64-bit mask into the instruction stream.
// We emit a 64-byte-wide series of elements from 0..63 which is later used
// as the compare target against the tail count contained in the tmp1
// register. The result is a k register holding tmp1 consecutive 1s,
// counting from the least significant bit.
6906 | address tmp = pc(); |
6907 | emit_int64(0x0706050403020100); |
6908 | emit_int64(0x0F0E0D0C0B0A0908); |
6909 | emit_int64(0x1716151413121110); |
6910 | emit_int64(0x1F1E1D1C1B1A1918); |
6911 | emit_int64(0x2726252423222120); |
6912 | emit_int64(0x2F2E2D2C2B2A2928); |
6913 | emit_int64(0x3736353433323130); |
6914 | emit_int64(0x3F3E3D3C3B3A3938); |
6915 | |
6916 | bind(k_init); |
6917 | lea(len, InternalAddress(tmp)); |
6918 | // create mask to test for negative byte inside a vector |
6919 | evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); |
6920 | evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit); |
6921 | |
6922 | #endif |
6923 | evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit); |
6924 | ktestq(k2, k3); |
6925 | jcc(Assembler::notZero, TRUE_LABEL); |
6926 | |
6927 | jmp(FALSE_LABEL); |
6928 | } else { |
6929 | movl(result, len); // copy |
6930 | |
if (UseAVX >= 2 && UseSSE >= 2) {
6932 | // With AVX2, use 32-byte vector compare |
6933 | Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
6934 | |
6935 | // Compare 32-byte vectors |
6936 | andl(result, 0x0000001f); // tail count (in bytes) |
6937 | andl(len, 0xffffffe0); // vector count (in bytes) |
6938 | jccb(Assembler::zero, COMPARE_TAIL); |
6939 | |
6940 | lea(ary1, Address(ary1, len, Address::times_1)); |
6941 | negptr(len); |
6942 | |
6943 | movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector |
6944 | movdl(vec2, tmp1); |
6945 | vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); |
6946 | |
6947 | bind(COMPARE_WIDE_VECTORS); |
6948 | vmovdqu(vec1, Address(ary1, len, Address::times_1)); |
6949 | vptest(vec1, vec2); |
6950 | jccb(Assembler::notZero, TRUE_LABEL); |
6951 | addptr(len, 32); |
6952 | jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); |
6953 | |
6954 | testl(result, result); |
6955 | jccb(Assembler::zero, FALSE_LABEL); |
6956 | |
6957 | vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); |
6958 | vptest(vec1, vec2); |
6959 | jccb(Assembler::notZero, TRUE_LABEL); |
6960 | jmpb(FALSE_LABEL); |
6961 | |
6962 | bind(COMPARE_TAIL); // len is zero |
6963 | movl(len, result); |
6964 | // Fallthru to tail compare |
6965 | } else if (UseSSE42Intrinsics) { |
6966 | // With SSE4.2, use double quad vector compare |
6967 | Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
6968 | |
6969 | // Compare 16-byte vectors |
6970 | andl(result, 0x0000000f); // tail count (in bytes) |
6971 | andl(len, 0xfffffff0); // vector count (in bytes) |
6972 | jcc(Assembler::zero, COMPARE_TAIL); |
6973 | |
6974 | lea(ary1, Address(ary1, len, Address::times_1)); |
6975 | negptr(len); |
6976 | |
6977 | movl(tmp1, 0x80808080); |
6978 | movdl(vec2, tmp1); |
6979 | pshufd(vec2, vec2, 0); |
6980 | |
6981 | bind(COMPARE_WIDE_VECTORS); |
6982 | movdqu(vec1, Address(ary1, len, Address::times_1)); |
6983 | ptest(vec1, vec2); |
6984 | jcc(Assembler::notZero, TRUE_LABEL); |
6985 | addptr(len, 16); |
6986 | jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); |
6987 | |
6988 | testl(result, result); |
6989 | jcc(Assembler::zero, FALSE_LABEL); |
6990 | |
6991 | movdqu(vec1, Address(ary1, result, Address::times_1, -16)); |
6992 | ptest(vec1, vec2); |
6993 | jccb(Assembler::notZero, TRUE_LABEL); |
6994 | jmpb(FALSE_LABEL); |
6995 | |
6996 | bind(COMPARE_TAIL); // len is zero |
6997 | movl(len, result); |
6998 | // Fallthru to tail compare |
6999 | } |
7000 | } |
7001 | // Compare 4-byte vectors |
7002 | andl(len, 0xfffffffc); // vector count (in bytes) |
7003 | jccb(Assembler::zero, COMPARE_CHAR); |
7004 | |
7005 | lea(ary1, Address(ary1, len, Address::times_1)); |
7006 | negptr(len); |
7007 | |
7008 | bind(COMPARE_VECTORS); |
7009 | movl(tmp1, Address(ary1, len, Address::times_1)); |
7010 | andl(tmp1, 0x80808080); |
7011 | jccb(Assembler::notZero, TRUE_LABEL); |
7012 | addptr(len, 4); |
7013 | jcc(Assembler::notZero, COMPARE_VECTORS); |
7014 | |
7015 | // Compare trailing char (final 2 bytes), if any |
7016 | bind(COMPARE_CHAR); |
7017 | testl(result, 0x2); // tail char |
7018 | jccb(Assembler::zero, COMPARE_BYTE); |
7019 | load_unsigned_short(tmp1, Address(ary1, 0)); |
7020 | andl(tmp1, 0x00008080); |
7021 | jccb(Assembler::notZero, TRUE_LABEL); |
7022 | subptr(result, 2); |
7023 | lea(ary1, Address(ary1, 2)); |
7024 | |
7025 | bind(COMPARE_BYTE); |
7026 | testl(result, 0x1); // tail byte |
7027 | jccb(Assembler::zero, FALSE_LABEL); |
7028 | load_unsigned_byte(tmp1, Address(ary1, 0)); |
7029 | andl(tmp1, 0x00000080); |
7030 | jccb(Assembler::notEqual, TRUE_LABEL); |
7031 | jmpb(FALSE_LABEL); |
7032 | |
7033 | bind(TRUE_LABEL); |
7034 | movl(result, 1); // return true |
7035 | jmpb(DONE); |
7036 | |
7037 | bind(FALSE_LABEL); |
7038 | xorl(result, result); // return false |
7039 | |
7040 | // That's it |
7041 | bind(DONE); |
7042 | if (UseAVX >= 2 && UseSSE >= 2) { |
7043 | // clean upper bits of YMM registers |
7044 | vpxor(vec1, vec1); |
7045 | vpxor(vec2, vec2); |
7046 | } |
7047 | } |
7048 | // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. |
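// For the is_array_equ case this matches Arrays.equals semantics; roughly,
// at the Java level (illustrative sketch only):
//
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }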
7049 | void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, |
7050 | Register limit, Register result, Register chr, |
7051 | XMMRegister vec1, XMMRegister vec2, bool is_char) { |
7052 | ShortBranchVerifier sbv(this); |
7053 | Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; |
7054 | |
7055 | int length_offset = arrayOopDesc::length_offset_in_bytes(); |
7056 | int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); |
7057 | |
7058 | if (is_array_equ) { |
7059 | // Check the input args |
7060 | cmpoop(ary1, ary2); |
7061 | jcc(Assembler::equal, TRUE_LABEL); |
7062 | |
7063 | // Need additional checks for arrays_equals. |
7064 | testptr(ary1, ary1); |
7065 | jcc(Assembler::zero, FALSE_LABEL); |
7066 | testptr(ary2, ary2); |
7067 | jcc(Assembler::zero, FALSE_LABEL); |
7068 | |
7069 | // Check the lengths |
7070 | movl(limit, Address(ary1, length_offset)); |
7071 | cmpl(limit, Address(ary2, length_offset)); |
7072 | jcc(Assembler::notEqual, FALSE_LABEL); |
7073 | } |
7074 | |
7075 | // count == 0 |
7076 | testl(limit, limit); |
7077 | jcc(Assembler::zero, TRUE_LABEL); |
7078 | |
7079 | if (is_array_equ) { |
7080 | // Load array address |
7081 | lea(ary1, Address(ary1, base_offset)); |
7082 | lea(ary2, Address(ary2, base_offset)); |
7083 | } |
7084 | |
7085 | if (is_array_equ && is_char) { |
7086 | // arrays_equals when used for char[]. |
7087 | shll(limit, 1); // byte count != 0 |
7088 | } |
7089 | movl(result, limit); // copy |
7090 | |
7091 | if (UseAVX >= 2) { |
7092 | // With AVX2, use 32-byte vector compare |
7093 | Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
7094 | |
7095 | // Compare 32-byte vectors |
7096 | andl(result, 0x0000001f); // tail count (in bytes) |
7097 | andl(limit, 0xffffffe0); // vector count (in bytes) |
7098 | jcc(Assembler::zero, COMPARE_TAIL); |
7099 | |
7100 | lea(ary1, Address(ary1, limit, Address::times_1)); |
7101 | lea(ary2, Address(ary2, limit, Address::times_1)); |
7102 | negptr(limit); |
7103 | |
7104 | bind(COMPARE_WIDE_VECTORS); |
7105 | |
7106 | #ifdef _LP64 |
7107 | if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
7108 | Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; |
7109 | |
7110 | cmpl(limit, -64); |
7111 | jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7112 | |
7113 | bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop |
7114 | |
7115 | evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); |
7116 | evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); |
7117 | kortestql(k7, k7); |
7118 | jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare |
7119 | addptr(limit, 64); // update since we already compared at this addr |
7120 | cmpl(limit, -64); |
7121 | jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); |
7122 | |
7123 | // At this point we may still need to compare -limit+result bytes. |
// We could execute the next two instructions and just continue via the non-wide path:
7125 | // cmpl(limit, 0); |
7126 | // jcc(Assembler::equal, COMPARE_TAIL); // true |
7127 | // But since we stopped at the points ary{1,2}+limit which are |
7128 | // not farther than 64 bytes from the ends of arrays ary{1,2}+result |
7129 | // (|limit| <= 32 and result < 32), |
7130 | // we may just compare the last 64 bytes. |
7131 | // |
7132 | addptr(result, -64); // it is safe, bc we just came from this area |
7133 | evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); |
7134 | evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); |
7135 | kortestql(k7, k7); |
7136 | jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare |
7137 | |
7138 | jmp(TRUE_LABEL); |
7139 | |
7140 | bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7141 | |
7142 | }//if (VM_Version::supports_avx512vlbw()) |
7143 | #endif //_LP64 |
7144 | |
7145 | vmovdqu(vec1, Address(ary1, limit, Address::times_1)); |
7146 | vmovdqu(vec2, Address(ary2, limit, Address::times_1)); |
7147 | vpxor(vec1, vec2); |
7148 | |
7149 | vptest(vec1, vec1); |
7150 | jcc(Assembler::notZero, FALSE_LABEL); |
7151 | addptr(limit, 32); |
7152 | jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); |
7153 | |
7154 | testl(result, result); |
7155 | jcc(Assembler::zero, TRUE_LABEL); |
7156 | |
7157 | vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); |
7158 | vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); |
7159 | vpxor(vec1, vec2); |
7160 | |
7161 | vptest(vec1, vec1); |
7162 | jccb(Assembler::notZero, FALSE_LABEL); |
7163 | jmpb(TRUE_LABEL); |
7164 | |
7165 | bind(COMPARE_TAIL); // limit is zero |
7166 | movl(limit, result); |
7167 | // Fallthru to tail compare |
7168 | } else if (UseSSE42Intrinsics) { |
7169 | // With SSE4.2, use double quad vector compare |
7170 | Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
7171 | |
7172 | // Compare 16-byte vectors |
7173 | andl(result, 0x0000000f); // tail count (in bytes) |
7174 | andl(limit, 0xfffffff0); // vector count (in bytes) |
7175 | jcc(Assembler::zero, COMPARE_TAIL); |
7176 | |
7177 | lea(ary1, Address(ary1, limit, Address::times_1)); |
7178 | lea(ary2, Address(ary2, limit, Address::times_1)); |
7179 | negptr(limit); |
7180 | |
7181 | bind(COMPARE_WIDE_VECTORS); |
7182 | movdqu(vec1, Address(ary1, limit, Address::times_1)); |
7183 | movdqu(vec2, Address(ary2, limit, Address::times_1)); |
7184 | pxor(vec1, vec2); |
7185 | |
7186 | ptest(vec1, vec1); |
7187 | jcc(Assembler::notZero, FALSE_LABEL); |
7188 | addptr(limit, 16); |
7189 | jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); |
7190 | |
7191 | testl(result, result); |
7192 | jcc(Assembler::zero, TRUE_LABEL); |
7193 | |
7194 | movdqu(vec1, Address(ary1, result, Address::times_1, -16)); |
7195 | movdqu(vec2, Address(ary2, result, Address::times_1, -16)); |
7196 | pxor(vec1, vec2); |
7197 | |
7198 | ptest(vec1, vec1); |
7199 | jccb(Assembler::notZero, FALSE_LABEL); |
7200 | jmpb(TRUE_LABEL); |
7201 | |
7202 | bind(COMPARE_TAIL); // limit is zero |
7203 | movl(limit, result); |
7204 | // Fallthru to tail compare |
7205 | } |
7206 | |
7207 | // Compare 4-byte vectors |
7208 | andl(limit, 0xfffffffc); // vector count (in bytes) |
7209 | jccb(Assembler::zero, COMPARE_CHAR); |
7210 | |
7211 | lea(ary1, Address(ary1, limit, Address::times_1)); |
7212 | lea(ary2, Address(ary2, limit, Address::times_1)); |
7213 | negptr(limit); |
7214 | |
7215 | bind(COMPARE_VECTORS); |
7216 | movl(chr, Address(ary1, limit, Address::times_1)); |
7217 | cmpl(chr, Address(ary2, limit, Address::times_1)); |
7218 | jccb(Assembler::notEqual, FALSE_LABEL); |
7219 | addptr(limit, 4); |
7220 | jcc(Assembler::notZero, COMPARE_VECTORS); |
7221 | |
7222 | // Compare trailing char (final 2 bytes), if any |
7223 | bind(COMPARE_CHAR); |
7224 | testl(result, 0x2); // tail char |
7225 | jccb(Assembler::zero, COMPARE_BYTE); |
7226 | load_unsigned_short(chr, Address(ary1, 0)); |
7227 | load_unsigned_short(limit, Address(ary2, 0)); |
7228 | cmpl(chr, limit); |
7229 | jccb(Assembler::notEqual, FALSE_LABEL); |
7230 | |
7231 | if (is_array_equ && is_char) { |
7232 | bind(COMPARE_BYTE); |
7233 | } else { |
7234 | lea(ary1, Address(ary1, 2)); |
7235 | lea(ary2, Address(ary2, 2)); |
7236 | |
7237 | bind(COMPARE_BYTE); |
7238 | testl(result, 0x1); // tail byte |
7239 | jccb(Assembler::zero, TRUE_LABEL); |
7240 | load_unsigned_byte(chr, Address(ary1, 0)); |
7241 | load_unsigned_byte(limit, Address(ary2, 0)); |
7242 | cmpl(chr, limit); |
7243 | jccb(Assembler::notEqual, FALSE_LABEL); |
7244 | } |
7245 | bind(TRUE_LABEL); |
7246 | movl(result, 1); // return true |
7247 | jmpb(DONE); |
7248 | |
7249 | bind(FALSE_LABEL); |
7250 | xorl(result, result); // return false |
7251 | |
7252 | // That's it |
7253 | bind(DONE); |
7254 | if (UseAVX >= 2) { |
7255 | // clean upper bits of YMM registers |
7256 | vpxor(vec1, vec1); |
7257 | vpxor(vec2, vec2); |
7258 | } |
7259 | } |
7260 | |
7261 | #endif |
7262 | |
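// Fills 'count' elements of type 't' starting at 'to' with 'value'; roughly the
// effect of (illustrative C sketch, not the emitted code):
//   for (size_t i = 0; i < count; i++) { ((T*)to)[i] = (T)value; }  // T per BasicType t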
7263 | void MacroAssembler::generate_fill(BasicType t, bool aligned, |
7264 | Register to, Register value, Register count, |
7265 | Register rtmp, XMMRegister xtmp) { |
7266 | ShortBranchVerifier sbv(this); |
7267 | assert_different_registers(to, value, count, rtmp); |
7268 | Label L_exit; |
7269 | Label L_fill_2_bytes, L_fill_4_bytes; |
7270 | |
7271 | int shift = -1; |
7272 | switch (t) { |
7273 | case T_BYTE: |
7274 | shift = 2; |
7275 | break; |
7276 | case T_SHORT: |
7277 | shift = 1; |
7278 | break; |
7279 | case T_INT: |
7280 | shift = 0; |
7281 | break; |
7282 | default: ShouldNotReachHere(); |
7283 | } |
7284 | |
7285 | if (t == T_BYTE) { |
7286 | andl(value, 0xff); |
7287 | movl(rtmp, value); |
7288 | shll(rtmp, 8); |
7289 | orl(value, rtmp); |
7290 | } |
7291 | if (t == T_SHORT) { |
7292 | andl(value, 0xffff); |
7293 | } |
7294 | if (t == T_BYTE || t == T_SHORT) { |
7295 | movl(rtmp, value); |
7296 | shll(rtmp, 16); |
7297 | orl(value, rtmp); |
7298 | } |
7299 | |
7300 | cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element |
7301 | jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp |
7302 | if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { |
7303 | Label L_skip_align2; |
7304 | // align source address at 4 bytes address boundary |
7305 | if (t == T_BYTE) { |
7306 | Label L_skip_align1; |
7307 | // One byte misalignment happens only for byte arrays |
7308 | testptr(to, 1); |
7309 | jccb(Assembler::zero, L_skip_align1); |
7310 | movb(Address(to, 0), value); |
7311 | increment(to); |
7312 | decrement(count); |
7313 | BIND(L_skip_align1); |
7314 | } |
7315 | // Two bytes misalignment happens only for byte and short (char) arrays |
7316 | testptr(to, 2); |
7317 | jccb(Assembler::zero, L_skip_align2); |
7318 | movw(Address(to, 0), value); |
7319 | addptr(to, 2); |
7320 | subl(count, 1<<(shift-1)); |
7321 | BIND(L_skip_align2); |
7322 | } |
7323 | if (UseSSE < 2) { |
7324 | Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; |
7325 | // Fill 32-byte chunks |
7326 | subl(count, 8 << shift); |
7327 | jcc(Assembler::less, L_check_fill_8_bytes); |
7328 | align(16); |
7329 | |
7330 | BIND(L_fill_32_bytes_loop); |
7331 | |
7332 | for (int i = 0; i < 32; i += 4) { |
7333 | movl(Address(to, i), value); |
7334 | } |
7335 | |
7336 | addptr(to, 32); |
7337 | subl(count, 8 << shift); |
7338 | jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); |
7339 | BIND(L_check_fill_8_bytes); |
7340 | addl(count, 8 << shift); |
7341 | jccb(Assembler::zero, L_exit); |
7342 | jmpb(L_fill_8_bytes); |
7343 | |
7344 | // |
7345 | // length is too short, just fill qwords |
7346 | // |
7347 | BIND(L_fill_8_bytes_loop); |
7348 | movl(Address(to, 0), value); |
7349 | movl(Address(to, 4), value); |
7350 | addptr(to, 8); |
7351 | BIND(L_fill_8_bytes); |
7352 | subl(count, 1 << (shift + 1)); |
7353 | jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); |
7354 | // fall through to fill 4 bytes |
7355 | } else { |
7356 | Label L_fill_32_bytes; |
7357 | if (!UseUnalignedLoadStores) { |
7358 | // align to 8 bytes, we know we are 4 byte aligned to start |
7359 | testptr(to, 4); |
7360 | jccb(Assembler::zero, L_fill_32_bytes); |
7361 | movl(Address(to, 0), value); |
7362 | addptr(to, 4); |
7363 | subl(count, 1<<shift); |
7364 | } |
7365 | BIND(L_fill_32_bytes); |
7366 | { |
7367 | assert( UseSSE >= 2, "supported cpu only" ); |
7368 | Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; |
7369 | movdl(xtmp, value); |
7370 | if (UseAVX > 2 && UseUnalignedLoadStores) { |
7371 | // Fill 64-byte chunks |
7372 | Label L_fill_64_bytes_loop, L_check_fill_32_bytes; |
7373 | vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); |
7374 | |
7375 | subl(count, 16 << shift); |
7376 | jcc(Assembler::less, L_check_fill_32_bytes); |
7377 | align(16); |
7378 | |
7379 | BIND(L_fill_64_bytes_loop); |
7380 | evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); |
7381 | addptr(to, 64); |
7382 | subl(count, 16 << shift); |
7383 | jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); |
7384 | |
7385 | BIND(L_check_fill_32_bytes); |
7386 | addl(count, 8 << shift); |
7387 | jccb(Assembler::less, L_check_fill_8_bytes); |
7388 | vmovdqu(Address(to, 0), xtmp); |
7389 | addptr(to, 32); |
7390 | subl(count, 8 << shift); |
7391 | |
7392 | BIND(L_check_fill_8_bytes); |
7393 | } else if (UseAVX == 2 && UseUnalignedLoadStores) { |
7394 | // Fill 64-byte chunks |
7395 | Label L_fill_64_bytes_loop, L_check_fill_32_bytes; |
7396 | vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); |
7397 | |
7398 | subl(count, 16 << shift); |
7399 | jcc(Assembler::less, L_check_fill_32_bytes); |
7400 | align(16); |
7401 | |
7402 | BIND(L_fill_64_bytes_loop); |
7403 | vmovdqu(Address(to, 0), xtmp); |
7404 | vmovdqu(Address(to, 32), xtmp); |
7405 | addptr(to, 64); |
7406 | subl(count, 16 << shift); |
7407 | jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); |
7408 | |
7409 | BIND(L_check_fill_32_bytes); |
7410 | addl(count, 8 << shift); |
7411 | jccb(Assembler::less, L_check_fill_8_bytes); |
7412 | vmovdqu(Address(to, 0), xtmp); |
7413 | addptr(to, 32); |
7414 | subl(count, 8 << shift); |
7415 | |
7416 | BIND(L_check_fill_8_bytes); |
7417 | // clean upper bits of YMM registers |
7418 | movdl(xtmp, value); |
7419 | pshufd(xtmp, xtmp, 0); |
7420 | } else { |
7421 | // Fill 32-byte chunks |
7422 | pshufd(xtmp, xtmp, 0); |
7423 | |
7424 | subl(count, 8 << shift); |
7425 | jcc(Assembler::less, L_check_fill_8_bytes); |
7426 | align(16); |
7427 | |
7428 | BIND(L_fill_32_bytes_loop); |
7429 | |
7430 | if (UseUnalignedLoadStores) { |
7431 | movdqu(Address(to, 0), xtmp); |
7432 | movdqu(Address(to, 16), xtmp); |
7433 | } else { |
7434 | movq(Address(to, 0), xtmp); |
7435 | movq(Address(to, 8), xtmp); |
7436 | movq(Address(to, 16), xtmp); |
7437 | movq(Address(to, 24), xtmp); |
7438 | } |
7439 | |
7440 | addptr(to, 32); |
7441 | subl(count, 8 << shift); |
7442 | jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); |
7443 | |
7444 | BIND(L_check_fill_8_bytes); |
7445 | } |
7446 | addl(count, 8 << shift); |
7447 | jccb(Assembler::zero, L_exit); |
7448 | jmpb(L_fill_8_bytes); |
7449 | |
7450 | // |
7451 | // length is too short, just fill qwords |
7452 | // |
7453 | BIND(L_fill_8_bytes_loop); |
7454 | movq(Address(to, 0), xtmp); |
7455 | addptr(to, 8); |
7456 | BIND(L_fill_8_bytes); |
7457 | subl(count, 1 << (shift + 1)); |
7458 | jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); |
7459 | } |
7460 | } |
7461 | // fill trailing 4 bytes |
7462 | BIND(L_fill_4_bytes); |
7463 | testl(count, 1<<shift); |
7464 | jccb(Assembler::zero, L_fill_2_bytes); |
7465 | movl(Address(to, 0), value); |
7466 | if (t == T_BYTE || t == T_SHORT) { |
7467 | Label L_fill_byte; |
7468 | addptr(to, 4); |
7469 | BIND(L_fill_2_bytes); |
7470 | // fill trailing 2 bytes |
7471 | testl(count, 1<<(shift-1)); |
7472 | jccb(Assembler::zero, L_fill_byte); |
7473 | movw(Address(to, 0), value); |
7474 | if (t == T_BYTE) { |
7475 | addptr(to, 2); |
7476 | BIND(L_fill_byte); |
7477 | // fill trailing byte |
7478 | testl(count, 1); |
7479 | jccb(Assembler::zero, L_exit); |
7480 | movb(Address(to, 0), value); |
7481 | } else { |
7482 | BIND(L_fill_byte); |
7483 | } |
7484 | } else { |
7485 | BIND(L_fill_2_bytes); |
7486 | } |
7487 | BIND(L_exit); |
7488 | } |
7489 | |
// encode char[] to byte[] in ISO_8859_1
//   @HotSpotIntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
7503 | void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, |
7504 | XMMRegister tmp1Reg, XMMRegister tmp2Reg, |
7505 | XMMRegister tmp3Reg, XMMRegister tmp4Reg, |
7506 | Register tmp5, Register result) { |
7507 | |
7508 | // rsi: src |
7509 | // rdi: dst |
7510 | // rdx: len |
7511 | // rcx: tmp5 |
7512 | // rax: result |
7513 | ShortBranchVerifier sbv(this); |
7514 | assert_different_registers(src, dst, len, tmp5, result); |
7515 | Label L_done, L_copy_1_char, L_copy_1_char_exit; |
7516 | |
7517 | // set result |
7518 | xorl(result, result); |
7519 | // check for zero length |
7520 | testl(len, len); |
7521 | jcc(Assembler::zero, L_done); |
7522 | |
7523 | movl(result, len); |
7524 | |
7525 | // Setup pointers |
7526 | lea(src, Address(src, len, Address::times_2)); // char[] |
7527 | lea(dst, Address(dst, len, Address::times_1)); // byte[] |
7528 | negptr(len); |
7529 | |
7530 | if (UseSSE42Intrinsics || UseAVX >= 2) { |
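    // Vectorized path: a block of chars is OR-combined and mask-tested against
    // 0xff00 per char to detect any value above '\u00FF'; if none is present the
    // 16-bit chars are packed down to bytes with (v)packuswb and stored.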
7531 | Label L_copy_8_chars, L_copy_8_chars_exit; |
7532 | Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; |
7533 | |
7534 | if (UseAVX >= 2) { |
7535 | Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; |
7536 | movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector |
7537 | movdl(tmp1Reg, tmp5); |
7538 | vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); |
7539 | jmp(L_chars_32_check); |
7540 | |
7541 | bind(L_copy_32_chars); |
7542 | vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); |
7543 | vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); |
7544 | vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); |
7545 | vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector |
7546 | jccb(Assembler::notZero, L_copy_32_chars_exit); |
7547 | vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); |
7548 | vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); |
7549 | vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); |
7550 | |
7551 | bind(L_chars_32_check); |
7552 | addptr(len, 32); |
7553 | jcc(Assembler::lessEqual, L_copy_32_chars); |
7554 | |
7555 | bind(L_copy_32_chars_exit); |
7556 | subptr(len, 16); |
7557 | jccb(Assembler::greater, L_copy_16_chars_exit); |
7558 | |
7559 | } else if (UseSSE42Intrinsics) { |
7560 | movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector |
7561 | movdl(tmp1Reg, tmp5); |
7562 | pshufd(tmp1Reg, tmp1Reg, 0); |
7563 | jmpb(L_chars_16_check); |
7564 | } |
7565 | |
7566 | bind(L_copy_16_chars); |
7567 | if (UseAVX >= 2) { |
7568 | vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); |
7569 | vptest(tmp2Reg, tmp1Reg); |
7570 | jcc(Assembler::notZero, L_copy_16_chars_exit); |
7571 | vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); |
7572 | vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1); |
7573 | } else { |
7574 | if (UseAVX > 0) { |
7575 | movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); |
7576 | movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); |
7577 | vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); |
7578 | } else { |
7579 | movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); |
7580 | por(tmp2Reg, tmp3Reg); |
7581 | movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); |
7582 | por(tmp2Reg, tmp4Reg); |
7583 | } |
7584 | ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector |
7585 | jccb(Assembler::notZero, L_copy_16_chars_exit); |
7586 | packuswb(tmp3Reg, tmp4Reg); |
7587 | } |
7588 | movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg); |
7589 | |
7590 | bind(L_chars_16_check); |
7591 | addptr(len, 16); |
7592 | jcc(Assembler::lessEqual, L_copy_16_chars); |
7593 | |
7594 | bind(L_copy_16_chars_exit); |
7595 | if (UseAVX >= 2) { |
7596 | // clean upper bits of YMM registers |
7597 | vpxor(tmp2Reg, tmp2Reg); |
7598 | vpxor(tmp3Reg, tmp3Reg); |
7599 | vpxor(tmp4Reg, tmp4Reg); |
7600 | movdl(tmp1Reg, tmp5); |
7601 | pshufd(tmp1Reg, tmp1Reg, 0); |
7602 | } |
7603 | subptr(len, 8); |
7604 | jccb(Assembler::greater, L_copy_8_chars_exit); |
7605 | |
7606 | bind(L_copy_8_chars); |
7607 | movdqu(tmp3Reg, Address(src, len, Address::times_2, -16)); |
7608 | ptest(tmp3Reg, tmp1Reg); |
7609 | jccb(Assembler::notZero, L_copy_8_chars_exit); |
7610 | packuswb(tmp3Reg, tmp1Reg); |
7611 | movq(Address(dst, len, Address::times_1, -8), tmp3Reg); |
7612 | addptr(len, 8); |
7613 | jccb(Assembler::lessEqual, L_copy_8_chars); |
7614 | |
7615 | bind(L_copy_8_chars_exit); |
7616 | subptr(len, 8); |
7617 | jccb(Assembler::zero, L_done); |
7618 | } |
7619 | |
7620 | bind(L_copy_1_char); |
7621 | load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); |
7622 | testl(tmp5, 0xff00); // check if Unicode char |
7623 | jccb(Assembler::notZero, L_copy_1_char_exit); |
7624 | movb(Address(dst, len, Address::times_1, 0), tmp5); |
7625 | addptr(len, 1); |
7626 | jccb(Assembler::less, L_copy_1_char); |
7627 | |
7628 | bind(L_copy_1_char_exit); |
7629 | addptr(result, len); // len is negative count of not processed elements |
7630 | |
7631 | bind(L_done); |
7632 | } |
7633 | |
7634 | #ifdef _LP64 |
7635 | /** |
7636 | * Helper for multiply_to_len(). |
7637 | */ |
7638 | void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { |
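  // Conceptually (dest_hi:dest_lo) += src1 + src2: each addq/adcq pair adds one
  // 64-bit value into the 128-bit accumulator and propagates its carry into dest_hi.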
7639 | addq(dest_lo, src1); |
7640 | adcq(dest_hi, 0); |
7641 | addq(dest_lo, src2); |
7642 | adcq(dest_hi, 0); |
7643 | } |
7644 | |
7645 | /** |
7646 | * Multiply 64 bit by 64 bit first loop. |
7647 | */ |
7648 | void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, |
7649 | Register y, Register y_idx, Register z, |
7650 | Register carry, Register product, |
7651 | Register idx, Register kdx) { |
7652 | // |
7653 | // jlong carry, x[], y[], z[]; |
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7655 | // huge_128 product = y[idx] * x[xstart] + carry; |
7656 | // z[kdx] = (jlong)product; |
7657 | // carry = (jlong)(product >>> 64); |
7658 | // } |
7659 | // z[xstart] = carry; |
7660 | // |
7661 | |
7662 | Label L_first_loop, L_first_loop_exit; |
7663 | Label L_one_x, L_one_y, L_multiply; |
7664 | |
7665 | decrementl(xstart); |
7666 | jcc(Assembler::negative, L_one_x); |
7667 | |
7668 | movq(x_xstart, Address(x, xstart, Address::times_4, 0)); |
7669 | rorq(x_xstart, 32); // convert big-endian to little-endian |
7670 | |
7671 | bind(L_first_loop); |
7672 | decrementl(idx); |
7673 | jcc(Assembler::negative, L_first_loop_exit); |
7674 | decrementl(idx); |
7675 | jcc(Assembler::negative, L_one_y); |
7676 | movq(y_idx, Address(y, idx, Address::times_4, 0)); |
7677 | rorq(y_idx, 32); // convert big-endian to little-endian |
7678 | bind(L_multiply); |
7679 | movq(product, x_xstart); |
7680 | mulq(y_idx); // product(rax) * y_idx -> rdx:rax |
7681 | addq(product, carry); |
7682 | adcq(rdx, 0); |
7683 | subl(kdx, 2); |
7684 | movl(Address(z, kdx, Address::times_4, 4), product); |
7685 | shrq(product, 32); |
7686 | movl(Address(z, kdx, Address::times_4, 0), product); |
7687 | movq(carry, rdx); |
7688 | jmp(L_first_loop); |
7689 | |
7690 | bind(L_one_y); |
7691 | movl(y_idx, Address(y, 0)); |
7692 | jmp(L_multiply); |
7693 | |
7694 | bind(L_one_x); |
7695 | movl(x_xstart, Address(x, 0)); |
7696 | jmp(L_first_loop); |
7697 | |
7698 | bind(L_first_loop_exit); |
7699 | } |
7700 | |
7701 | /** |
7702 | * Multiply 64 bit by 64 bit and add 128 bit. |
7703 | */ |
7704 | void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, |
7705 | Register yz_idx, Register idx, |
7706 | Register carry, Register product, int offset) { |
7707 | // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; |
7708 | // z[kdx] = (jlong)product; |
7709 | |
7710 | movq(yz_idx, Address(y, idx, Address::times_4, offset)); |
7711 | rorq(yz_idx, 32); // convert big-endian to little-endian |
7712 | movq(product, x_xstart); |
7713 | mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) |
7714 | movq(yz_idx, Address(z, idx, Address::times_4, offset)); |
7715 | rorq(yz_idx, 32); // convert big-endian to little-endian |
7716 | |
7717 | add2_with_carry(rdx, product, carry, yz_idx); |
7718 | |
7719 | movl(Address(z, idx, Address::times_4, offset+4), product); |
7720 | shrq(product, 32); |
7721 | movl(Address(z, idx, Address::times_4, offset), product); |
7722 | |
7723 | } |
7724 | |
7725 | /** |
7726 | * Multiply 128 bit by 128 bit. Unrolled inner loop. |
7727 | */ |
7728 | void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, |
7729 | Register yz_idx, Register idx, Register jdx, |
7730 | Register carry, Register product, |
7731 | Register carry2) { |
7732 | // jlong carry, x[], y[], z[]; |
7733 | // int kdx = ystart+1; |
7734 | // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop |
7735 | // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; |
7736 | // z[kdx+idx+1] = (jlong)product; |
7737 | // jlong carry2 = (jlong)(product >>> 64); |
7738 | // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; |
7739 | // z[kdx+idx] = (jlong)product; |
7740 | // carry = (jlong)(product >>> 64); |
7741 | // } |
7742 | // idx += 2; |
7743 | // if (idx > 0) { |
7744 | // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; |
7745 | // z[kdx+idx] = (jlong)product; |
7746 | // carry = (jlong)(product >>> 64); |
7747 | // } |
7748 | // |
7749 | |
7750 | Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; |
7751 | |
7752 | movl(jdx, idx); |
7753 | andl(jdx, 0xFFFFFFFC); |
7754 | shrl(jdx, 2); |
7755 | |
7756 | bind(L_third_loop); |
7757 | subl(jdx, 1); |
7758 | jcc(Assembler::negative, L_third_loop_exit); |
7759 | subl(idx, 4); |
7760 | |
7761 | multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); |
7762 | movq(carry2, rdx); |
7763 | |
7764 | multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); |
7765 | movq(carry, rdx); |
7766 | jmp(L_third_loop); |
7767 | |
7768 | bind (L_third_loop_exit); |
7769 | |
7770 | andl (idx, 0x3); |
7771 | jcc(Assembler::zero, L_post_third_loop_done); |
7772 | |
7773 | Label L_check_1; |
7774 | subl(idx, 2); |
7775 | jcc(Assembler::negative, L_check_1); |
7776 | |
7777 | multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); |
7778 | movq(carry, rdx); |
7779 | |
7780 | bind (L_check_1); |
7781 | addl (idx, 0x2); |
7782 | andl (idx, 0x1); |
7783 | subl(idx, 1); |
7784 | jcc(Assembler::negative, L_post_third_loop_done); |
7785 | |
7786 | movl(yz_idx, Address(y, idx, Address::times_4, 0)); |
7787 | movq(product, x_xstart); |
7788 | mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) |
7789 | movl(yz_idx, Address(z, idx, Address::times_4, 0)); |
7790 | |
7791 | add2_with_carry(rdx, product, yz_idx, carry); |
7792 | |
7793 | movl(Address(z, idx, Address::times_4, 0), product); |
7794 | shrq(product, 32); |
7795 | |
7796 | shlq(rdx, 32); |
7797 | orq(product, rdx); |
7798 | movq(carry, product); |
7799 | |
7800 | bind(L_post_third_loop_done); |
7801 | } |
7802 | |
7803 | /** |
7804 | * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. |
7805 | * |
7806 | */ |
7807 | void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, |
7808 | Register carry, Register carry2, |
7809 | Register idx, Register jdx, |
7810 | Register yz_idx1, Register yz_idx2, |
7811 | Register tmp, Register tmp3, Register tmp4) { |
7812 | assert(UseBMI2Instructions, "should be used only when BMI2 is available" ); |
7813 | |
7814 | // jlong carry, x[], y[], z[]; |
7815 | // int kdx = ystart+1; |
7816 | // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop |
7817 | // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; |
7818 | // jlong carry2 = (jlong)(tmp3 >>> 64); |
7819 | // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; |
7820 | // carry = (jlong)(tmp4 >>> 64); |
7821 | // z[kdx+idx+1] = (jlong)tmp3; |
7822 | // z[kdx+idx] = (jlong)tmp4; |
7823 | // } |
7824 | // idx += 2; |
7825 | // if (idx > 0) { |
7826 | // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; |
7827 | // z[kdx+idx] = (jlong)yz_idx1; |
7828 | // carry = (jlong)(yz_idx1 >>> 64); |
7829 | // } |
7830 | // |
7831 | |
7832 | Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; |
7833 | |
7834 | movl(jdx, idx); |
7835 | andl(jdx, 0xFFFFFFFC); |
7836 | shrl(jdx, 2); |
7837 | |
7838 | bind(L_third_loop); |
7839 | subl(jdx, 1); |
7840 | jcc(Assembler::negative, L_third_loop_exit); |
7841 | subl(idx, 4); |
7842 | |
7843 | movq(yz_idx1, Address(y, idx, Address::times_4, 8)); |
7844 | rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian |
7845 | movq(yz_idx2, Address(y, idx, Address::times_4, 0)); |
7846 | rorxq(yz_idx2, yz_idx2, 32); |
7847 | |
7848 | mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 |
7849 | mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp |
7850 | |
7851 | movq(yz_idx1, Address(z, idx, Address::times_4, 8)); |
7852 | rorxq(yz_idx1, yz_idx1, 32); |
7853 | movq(yz_idx2, Address(z, idx, Address::times_4, 0)); |
7854 | rorxq(yz_idx2, yz_idx2, 32); |
7855 | |
7856 | if (VM_Version::supports_adx()) { |
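    // ADX: adcx consumes/produces only CF and adox only OF, so the two carry
    // chains below can be interleaved without clobbering each other's state.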
7857 | adcxq(tmp3, carry); |
7858 | adoxq(tmp3, yz_idx1); |
7859 | |
7860 | adcxq(tmp4, tmp); |
7861 | adoxq(tmp4, yz_idx2); |
7862 | |
7863 | movl(carry, 0); // does not affect flags |
7864 | adcxq(carry2, carry); |
7865 | adoxq(carry2, carry); |
7866 | } else { |
7867 | add2_with_carry(tmp4, tmp3, carry, yz_idx1); |
7868 | add2_with_carry(carry2, tmp4, tmp, yz_idx2); |
7869 | } |
7870 | movq(carry, carry2); |
7871 | |
7872 | movl(Address(z, idx, Address::times_4, 12), tmp3); |
7873 | shrq(tmp3, 32); |
7874 | movl(Address(z, idx, Address::times_4, 8), tmp3); |
7875 | |
7876 | movl(Address(z, idx, Address::times_4, 4), tmp4); |
7877 | shrq(tmp4, 32); |
7878 | movl(Address(z, idx, Address::times_4, 0), tmp4); |
7879 | |
7880 | jmp(L_third_loop); |
7881 | |
7882 | bind (L_third_loop_exit); |
7883 | |
7884 | andl (idx, 0x3); |
7885 | jcc(Assembler::zero, L_post_third_loop_done); |
7886 | |
7887 | Label L_check_1; |
7888 | subl(idx, 2); |
7889 | jcc(Assembler::negative, L_check_1); |
7890 | |
7891 | movq(yz_idx1, Address(y, idx, Address::times_4, 0)); |
7892 | rorxq(yz_idx1, yz_idx1, 32); |
7893 | mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 |
7894 | movq(yz_idx2, Address(z, idx, Address::times_4, 0)); |
7895 | rorxq(yz_idx2, yz_idx2, 32); |
7896 | |
7897 | add2_with_carry(tmp4, tmp3, carry, yz_idx2); |
7898 | |
7899 | movl(Address(z, idx, Address::times_4, 4), tmp3); |
7900 | shrq(tmp3, 32); |
7901 | movl(Address(z, idx, Address::times_4, 0), tmp3); |
7902 | movq(carry, tmp4); |
7903 | |
7904 | bind (L_check_1); |
7905 | addl (idx, 0x2); |
7906 | andl (idx, 0x1); |
7907 | subl(idx, 1); |
7908 | jcc(Assembler::negative, L_post_third_loop_done); |
7909 | movl(tmp4, Address(y, idx, Address::times_4, 0)); |
7910 | mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 |
7911 | movl(tmp4, Address(z, idx, Address::times_4, 0)); |
7912 | |
7913 | add2_with_carry(carry2, tmp3, tmp4, carry); |
7914 | |
7915 | movl(Address(z, idx, Address::times_4, 0), tmp3); |
7916 | shrq(tmp3, 32); |
7917 | |
7918 | shlq(carry2, 32); |
7919 | orq(tmp3, carry2); |
7920 | movq(carry, tmp3); |
7921 | |
7922 | bind(L_post_third_loop_done); |
7923 | } |
7924 | |
7925 | /** |
 * Code for BigInteger::multiplyToLen() intrinsic.
7927 | * |
7928 | * rdi: x |
7929 | * rax: xlen |
7930 | * rsi: y |
7931 | * rcx: ylen |
7932 | * r8: z |
7933 | * r11: zlen |
7934 | * r12: tmp1 |
7935 | * r13: tmp2 |
7936 | * r14: tmp3 |
7937 | * r15: tmp4 |
7938 | * rbx: tmp5 |
7939 | * |
7940 | */ |
7941 | void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, |
7942 | Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { |
7943 | ShortBranchVerifier sbv(this); |
7944 | assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); |
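  // x, y and z follow BigInteger's magnitude layout (int words, most significant
  // first), so a 64-bit load of two adjacent ints lands the more significant int in
  // the low register half; the rorq/rorxq(.., 32) steps swap the halves back.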
7945 | |
7946 | push(tmp1); |
7947 | push(tmp2); |
7948 | push(tmp3); |
7949 | push(tmp4); |
7950 | push(tmp5); |
7951 | |
7952 | push(xlen); |
7953 | push(zlen); |
7954 | |
7955 | const Register idx = tmp1; |
7956 | const Register kdx = tmp2; |
7957 | const Register xstart = tmp3; |
7958 | |
7959 | const Register y_idx = tmp4; |
7960 | const Register carry = tmp5; |
7961 | const Register product = xlen; |
7962 | const Register x_xstart = zlen; // reuse register |
7963 | |
7964 | // First Loop. |
7965 | // |
7966 | // final static long LONG_MASK = 0xffffffffL; |
7967 | // int xstart = xlen - 1; |
7968 | // int ystart = ylen - 1; |
7969 | // long carry = 0; |
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7971 | // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; |
7972 | // z[kdx] = (int)product; |
7973 | // carry = product >>> 32; |
7974 | // } |
7975 | // z[xstart] = (int)carry; |
7976 | // |
7977 | |
7978 | movl(idx, ylen); // idx = ylen; |
7979 | movl(kdx, zlen); // kdx = xlen+ylen; |
7980 | xorq(carry, carry); // carry = 0; |
7981 | |
7982 | Label L_done; |
7983 | |
7984 | movl(xstart, xlen); |
7985 | decrementl(xstart); |
7986 | jcc(Assembler::negative, L_done); |
7987 | |
7988 | multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); |
7989 | |
7990 | Label L_second_loop; |
7991 | testl(kdx, kdx); |
7992 | jcc(Assembler::zero, L_second_loop); |
7993 | |
7994 | Label L_carry; |
7995 | subl(kdx, 1); |
7996 | jcc(Assembler::zero, L_carry); |
7997 | |
7998 | movl(Address(z, kdx, Address::times_4, 0), carry); |
7999 | shrq(carry, 32); |
8000 | subl(kdx, 1); |
8001 | |
8002 | bind(L_carry); |
8003 | movl(Address(z, kdx, Address::times_4, 0), carry); |
8004 | |
8005 | // Second and third (nested) loops. |
8006 | // |
8007 | // for (int i = xstart-1; i >= 0; i--) { // Second loop |
8008 | // carry = 0; |
8009 | // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop |
8010 | // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + |
8011 | // (z[k] & LONG_MASK) + carry; |
8012 | // z[k] = (int)product; |
8013 | // carry = product >>> 32; |
8014 | // } |
8015 | // z[i] = (int)carry; |
8016 | // } |
8017 | // |
8018 | // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx |
8019 | |
8020 | const Register jdx = tmp1; |
8021 | |
8022 | bind(L_second_loop); |
8023 | xorl(carry, carry); // carry = 0; |
8024 | movl(jdx, ylen); // j = ystart+1 |
8025 | |
8026 | subl(xstart, 1); // i = xstart-1; |
8027 | jcc(Assembler::negative, L_done); |
8028 | |
8029 | push (z); |
8030 | |
8031 | Label L_last_x; |
8032 | lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j |
8033 | subl(xstart, 1); // i = xstart-1; |
8034 | jcc(Assembler::negative, L_last_x); |
8035 | |
8036 | if (UseBMI2Instructions) { |
8037 | movq(rdx, Address(x, xstart, Address::times_4, 0)); |
8038 | rorxq(rdx, rdx, 32); // convert big-endian to little-endian |
8039 | } else { |
8040 | movq(x_xstart, Address(x, xstart, Address::times_4, 0)); |
8041 | rorq(x_xstart, 32); // convert big-endian to little-endian |
8042 | } |
8043 | |
8044 | Label L_third_loop_prologue; |
8045 | bind(L_third_loop_prologue); |
8046 | |
8047 | push (x); |
8048 | push (xstart); |
8049 | push (ylen); |
8050 | |
8051 | |
8052 | if (UseBMI2Instructions) { |
8053 | multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); |
8054 | } else { // !UseBMI2Instructions |
8055 | multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); |
8056 | } |
8057 | |
8058 | pop(ylen); |
8059 | pop(xlen); |
8060 | pop(x); |
8061 | pop(z); |
8062 | |
8063 | movl(tmp3, xlen); |
8064 | addl(tmp3, 1); |
8065 | movl(Address(z, tmp3, Address::times_4, 0), carry); |
8066 | subl(tmp3, 1); |
8067 | jccb(Assembler::negative, L_done); |
8068 | |
8069 | shrq(carry, 32); |
8070 | movl(Address(z, tmp3, Address::times_4, 0), carry); |
8071 | jmp(L_second_loop); |
8072 | |
8073 | // Next infrequent code is moved outside loops. |
8074 | bind(L_last_x); |
8075 | if (UseBMI2Instructions) { |
8076 | movl(rdx, Address(x, 0)); |
8077 | } else { |
8078 | movl(x_xstart, Address(x, 0)); |
8079 | } |
8080 | jmp(L_third_loop_prologue); |
8081 | |
8082 | bind(L_done); |
8083 | |
8084 | pop(zlen); |
8085 | pop(xlen); |
8086 | |
8087 | pop(tmp5); |
8088 | pop(tmp4); |
8089 | pop(tmp3); |
8090 | pop(tmp2); |
8091 | pop(tmp1); |
8092 | } |
8093 | |
8094 | void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, |
8095 | Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ |
8096 | assert(UseSSE42Intrinsics, "SSE4.2 must be enabled." ); |
8097 | Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; |
8098 | Label VECTOR8_TAIL, VECTOR4_TAIL; |
8099 | Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; |
8100 | Label SAME_TILL_END, DONE; |
8101 | Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; |
8102 | |
8103 | //scale is in rcx in both Win64 and Unix |
8104 | ShortBranchVerifier sbv(this); |
8105 | |
8106 | shlq(length); |
8107 | xorq(result, result); |
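  // 'length' was scaled from elements to bytes above (the shift count is the element
  // scale in rcx); 'result' tracks a byte offset and is scaled back to an element
  // index before returning, with -1 reporting that no mismatch was found.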
8108 | |
8109 | if ((UseAVX > 2) && |
8110 | VM_Version::supports_avx512vlbw()) { |
8111 | Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; |
8112 | |
8113 | cmpq(length, 64); |
8114 | jcc(Assembler::less, VECTOR32_TAIL); |
8115 | movq(tmp1, length); |
8116 | andq(tmp1, 0x3F); // tail count |
8117 | andq(length, ~(0x3F)); //vector count |
8118 | |
8119 | bind(VECTOR64_LOOP); |
8120 | // AVX512 code to compare 64 byte vectors. |
8121 | evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit); |
8122 | evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); |
8123 | kortestql(k7, k7); |
8124 | jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch |
8125 | addq(result, 64); |
8126 | subq(length, 64); |
8127 | jccb(Assembler::notZero, VECTOR64_LOOP); |
8128 | |
8129 | //bind(VECTOR64_TAIL); |
8130 | testq(tmp1, tmp1); |
8131 | jcc(Assembler::zero, SAME_TILL_END); |
8132 | |
8133 | //bind(VECTOR64_TAIL); |
  // AVX512 code to compare up to 63 byte vectors.
8135 | mov64(tmp2, 0xFFFFFFFFFFFFFFFF); |
8136 | shlxq(tmp2, tmp2, tmp1); |
8137 | notq(tmp2); |
8138 | kmovql(k3, tmp2); |
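  // k3 now holds a byte mask with only the low 'tmp1' bits set, so the masked
  // load and compare below touch just the remaining tail bytes.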
8139 | |
8140 | evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit); |
8141 | evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); |
8142 | |
8143 | ktestql(k7, k3); |
8144 | jcc(Assembler::below, SAME_TILL_END); // not mismatch |
8145 | |
8146 | bind(VECTOR64_NOT_EQUAL); |
8147 | kmovql(tmp1, k7); |
8148 | notq(tmp1); |
8149 | tzcntq(tmp1, tmp1); |
8150 | addq(result, tmp1); |
8151 | shrq(result); |
8152 | jmp(DONE); |
8153 | bind(VECTOR32_TAIL); |
8154 | } |
8155 | |
8156 | cmpq(length, 8); |
8157 | jcc(Assembler::equal, VECTOR8_LOOP); |
8158 | jcc(Assembler::less, VECTOR4_TAIL); |
8159 | |
8160 | if (UseAVX >= 2) { |
8161 | Label VECTOR16_TAIL, VECTOR32_LOOP; |
8162 | |
8163 | cmpq(length, 16); |
8164 | jcc(Assembler::equal, VECTOR16_LOOP); |
8165 | jcc(Assembler::less, VECTOR8_LOOP); |
8166 | |
8167 | cmpq(length, 32); |
8168 | jccb(Assembler::less, VECTOR16_TAIL); |
8169 | |
8170 | subq(length, 32); |
8171 | bind(VECTOR32_LOOP); |
8172 | vmovdqu(rymm0, Address(obja, result)); |
8173 | vmovdqu(rymm1, Address(objb, result)); |
8174 | vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); |
8175 | vptest(rymm2, rymm2); |
8176 | jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found |
8177 | addq(result, 32); |
8178 | subq(length, 32); |
8179 | jcc(Assembler::greaterEqual, VECTOR32_LOOP); |
8180 | addq(length, 32); |
8181 | jcc(Assembler::equal, SAME_TILL_END); |
8182 | //falling through if less than 32 bytes left //close the branch here. |
8183 | |
8184 | bind(VECTOR16_TAIL); |
8185 | cmpq(length, 16); |
8186 | jccb(Assembler::less, VECTOR8_TAIL); |
8187 | bind(VECTOR16_LOOP); |
8188 | movdqu(rymm0, Address(obja, result)); |
8189 | movdqu(rymm1, Address(objb, result)); |
8190 | vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); |
8191 | ptest(rymm2, rymm2); |
8192 | jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found |
8193 | addq(result, 16); |
8194 | subq(length, 16); |
8195 | jcc(Assembler::equal, SAME_TILL_END); |
8196 | //falling through if less than 16 bytes left |
8197 | } else {//regular intrinsics |
8198 | |
8199 | cmpq(length, 16); |
8200 | jccb(Assembler::less, VECTOR8_TAIL); |
8201 | |
8202 | subq(length, 16); |
8203 | bind(VECTOR16_LOOP); |
8204 | movdqu(rymm0, Address(obja, result)); |
8205 | movdqu(rymm1, Address(objb, result)); |
8206 | pxor(rymm0, rymm1); |
8207 | ptest(rymm0, rymm0); |
8208 | jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found |
8209 | addq(result, 16); |
8210 | subq(length, 16); |
8211 | jccb(Assembler::greaterEqual, VECTOR16_LOOP); |
8212 | addq(length, 16); |
8213 | jcc(Assembler::equal, SAME_TILL_END); |
8214 | //falling through if less than 16 bytes left |
8215 | } |
8216 | |
8217 | bind(VECTOR8_TAIL); |
8218 | cmpq(length, 8); |
8219 | jccb(Assembler::less, VECTOR4_TAIL); |
8220 | bind(VECTOR8_LOOP); |
8221 | movq(tmp1, Address(obja, result)); |
8222 | movq(tmp2, Address(objb, result)); |
8223 | xorq(tmp1, tmp2); |
8224 | testq(tmp1, tmp1); |
8225 | jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found |
8226 | addq(result, 8); |
8227 | subq(length, 8); |
8228 | jcc(Assembler::equal, SAME_TILL_END); |
8229 | //falling through if less than 8 bytes left |
8230 | |
8231 | bind(VECTOR4_TAIL); |
8232 | cmpq(length, 4); |
8233 | jccb(Assembler::less, BYTES_TAIL); |
8234 | bind(VECTOR4_LOOP); |
8235 | movl(tmp1, Address(obja, result)); |
8236 | xorl(tmp1, Address(objb, result)); |
8237 | testl(tmp1, tmp1); |
8238 | jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found |
8239 | addq(result, 4); |
8240 | subq(length, 4); |
8241 | jcc(Assembler::equal, SAME_TILL_END); |
8242 | //falling through if less than 4 bytes left |
8243 | |
8244 | bind(BYTES_TAIL); |
8245 | bind(BYTES_LOOP); |
8246 | load_unsigned_byte(tmp1, Address(obja, result)); |
8247 | load_unsigned_byte(tmp2, Address(objb, result)); |
8248 | xorl(tmp1, tmp2); |
8249 | testl(tmp1, tmp1); |
8250 | jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found |
8251 | decq(length); |
8252 | jcc(Assembler::zero, SAME_TILL_END); |
8253 | incq(result); |
8254 | load_unsigned_byte(tmp1, Address(obja, result)); |
8255 | load_unsigned_byte(tmp2, Address(objb, result)); |
8256 | xorl(tmp1, tmp2); |
8257 | testl(tmp1, tmp1); |
8258 | jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found |
8259 | decq(length); |
8260 | jcc(Assembler::zero, SAME_TILL_END); |
8261 | incq(result); |
8262 | load_unsigned_byte(tmp1, Address(obja, result)); |
8263 | load_unsigned_byte(tmp2, Address(objb, result)); |
8264 | xorl(tmp1, tmp2); |
8265 | testl(tmp1, tmp1); |
8266 | jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found |
8267 | jmp(SAME_TILL_END); |
8268 | |
8269 | if (UseAVX >= 2) { |
8270 | bind(VECTOR32_NOT_EQUAL); |
8271 | vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); |
8272 | vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); |
8273 | vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); |
8274 | vpmovmskb(tmp1, rymm0); |
8275 | bsfq(tmp1, tmp1); |
8276 | addq(result, tmp1); |
8277 | shrq(result); |
8278 | jmp(DONE); |
8279 | } |
8280 | |
8281 | bind(VECTOR16_NOT_EQUAL); |
8282 | if (UseAVX >= 2) { |
8283 | vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); |
8284 | vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); |
8285 | pxor(rymm0, rymm2); |
8286 | } else { |
8287 | pcmpeqb(rymm2, rymm2); |
8288 | pxor(rymm0, rymm1); |
8289 | pcmpeqb(rymm0, rymm1); |
8290 | pxor(rymm0, rymm2); |
8291 | } |
8292 | pmovmskb(tmp1, rymm0); |
8293 | bsfq(tmp1, tmp1); |
8294 | addq(result, tmp1); |
8295 | shrq(result); |
8296 | jmpb(DONE); |
8297 | |
8298 | bind(VECTOR8_NOT_EQUAL); |
8299 | bind(VECTOR4_NOT_EQUAL); |
8300 | bsfq(tmp1, tmp1); |
8301 | shrq(tmp1, 3); |
8302 | addq(result, tmp1); |
8303 | bind(BYTES_NOT_EQUAL); |
8304 | shrq(result); |
8305 | jmpb(DONE); |
8306 | |
8307 | bind(SAME_TILL_END); |
8308 | mov64(result, -1); |
8309 | |
8310 | bind(DONE); |
8311 | } |
8312 | |
// Helper functions for square_to_len()
8314 | |
8315 | /** |
8316 | * Store the squares of x[], right shifted one bit (divided by 2) into z[] |
8317 | * Preserves x and z and modifies rest of the registers. |
8318 | */ |
8319 | void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
8320 | // Perform square and right shift by 1 |
8321 | // Handle odd xlen case first, then for even xlen do the following |
8322 | // jlong carry = 0; |
8323 | // for (int j=0, i=0; j < xlen; j+=2, i+=4) { |
8324 | // huge_128 product = x[j:j+1] * x[j:j+1]; |
8325 | // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); |
8326 | // z[i+2:i+3] = (jlong)(product >>> 1); |
8327 | // carry = (jlong)product; |
8328 | // } |
8329 | |
8330 | xorq(tmp5, tmp5); // carry |
8331 | xorq(rdxReg, rdxReg); |
8332 | xorl(tmp1, tmp1); // index for x |
8333 | xorl(tmp4, tmp4); // index for z |
8334 | |
8335 | Label L_first_loop, L_first_loop_exit; |
8336 | |
8337 | testl(xlen, 1); |
8338 | jccb(Assembler::zero, L_first_loop); //jump if xlen is even |
8339 | |
8340 | // Square and right shift by 1 the odd element using 32 bit multiply |
8341 | movl(raxReg, Address(x, tmp1, Address::times_4, 0)); |
8342 | imulq(raxReg, raxReg); |
8343 | shrq(raxReg, 1); |
8344 | adcq(tmp5, 0); |
8345 | movq(Address(z, tmp4, Address::times_4, 0), raxReg); |
8346 | incrementl(tmp1); |
8347 | addl(tmp4, 2); |
8348 | |
8349 | // Square and right shift by 1 the rest using 64 bit multiply |
8350 | bind(L_first_loop); |
8351 | cmpptr(tmp1, xlen); |
8352 | jccb(Assembler::equal, L_first_loop_exit); |
8353 | |
8354 | // Square |
8355 | movq(raxReg, Address(x, tmp1, Address::times_4, 0)); |
8356 | rorq(raxReg, 32); // convert big-endian to little-endian |
8357 | mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax |
8358 | |
8359 | // Right shift by 1 and save carry |
8360 | shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 |
8361 | rcrq(rdxReg, 1); |
8362 | rcrq(raxReg, 1); |
8363 | adcq(tmp5, 0); |
8364 | |
8365 | // Store result in z |
8366 | movq(Address(z, tmp4, Address::times_4, 0), rdxReg); |
8367 | movq(Address(z, tmp4, Address::times_4, 8), raxReg); |
8368 | |
8369 | // Update indices for x and z |
8370 | addl(tmp1, 2); |
8371 | addl(tmp4, 4); |
8372 | jmp(L_first_loop); |
8373 | |
8374 | bind(L_first_loop_exit); |
8375 | } |
8376 | |
8377 | |
8378 | /** |
8379 | * Perform the following multiply add operation using BMI2 instructions |
8380 | * carry:sum = sum + op1*op2 + carry |
8381 | * op2 should be in rdx |
8382 | * op2 is preserved, all other registers are modified |
8383 | */ |
8384 | void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { |
8385 | // assert op2 is rdx |
8386 | mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 |
8387 | addq(sum, carry); |
8388 | adcq(tmp2, 0); |
8389 | addq(sum, op1); |
8390 | adcq(tmp2, 0); |
8391 | movq(carry, tmp2); |
8392 | } |
8393 | |
8394 | /** |
8395 | * Perform the following multiply add operation: |
8396 | * carry:sum = sum + op1*op2 + carry |
8397 | * Preserves op1, op2 and modifies rest of registers |
8398 | */ |
8399 | void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { |
8400 | // rdx:rax = op1 * op2 |
8401 | movq(raxReg, op2); |
8402 | mulq(op1); |
8403 | |
8404 | // rdx:rax = sum + carry + rdx:rax |
8405 | addq(sum, carry); |
8406 | adcq(rdxReg, 0); |
8407 | addq(sum, raxReg); |
8408 | adcq(rdxReg, 0); |
8409 | |
8410 | // carry:sum = rdx:sum |
8411 | movq(carry, rdxReg); |
8412 | } |
8413 | |
8414 | /** |
 * Add 64 bit long carry into z[] with carry propagation.
8416 | * Preserves z and carry register values and modifies rest of registers. |
8417 | * |
8418 | */ |
8419 | void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { |
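  // Adds 'carry' into the 64-bit word at z[zlen-2..zlen-1] and, while the addition
  // keeps carrying, ripples a 1 into the next more significant 64-bit word of z.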
8420 | Label L_fourth_loop, L_fourth_loop_exit; |
8421 | |
8422 | movl(tmp1, 1); |
8423 | subl(zlen, 2); |
8424 | addq(Address(z, zlen, Address::times_4, 0), carry); |
8425 | |
8426 | bind(L_fourth_loop); |
8427 | jccb(Assembler::carryClear, L_fourth_loop_exit); |
8428 | subl(zlen, 2); |
8429 | jccb(Assembler::negative, L_fourth_loop_exit); |
8430 | addq(Address(z, zlen, Address::times_4, 0), tmp1); |
8431 | jmp(L_fourth_loop); |
8432 | bind(L_fourth_loop_exit); |
8433 | } |
8434 | |
8435 | /** |
8436 | * Shift z[] left by 1 bit. |
8437 | * Preserves x, len, z and zlen registers and modifies rest of the registers. |
8438 | * |
8439 | */ |
8440 | void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { |
8441 | |
8442 | Label L_fifth_loop, L_fifth_loop_exit; |
8443 | |
8444 | // Fifth loop |
8445 | // Perform primitiveLeftShift(z, zlen, 1) |
8446 | |
8447 | const Register prev_carry = tmp1; |
8448 | const Register new_carry = tmp4; |
8449 | const Register value = tmp2; |
8450 | const Register zidx = tmp3; |
8451 | |
8452 | // int zidx, carry; |
8453 | // long value; |
8454 | // carry = 0; |
8455 | // for (zidx = zlen-2; zidx >=0; zidx -= 2) { |
  //    (carry:value) = (z[zidx] << 1) | carry ;
  //    z[zidx] = value;
8458 | // } |
8459 | |
8460 | movl(zidx, zlen); |
8461 | xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register |
8462 | |
8463 | bind(L_fifth_loop); |
8464 | decl(zidx); // Use decl to preserve carry flag |
8465 | decl(zidx); |
8466 | jccb(Assembler::negative, L_fifth_loop_exit); |
8467 | |
8468 | if (UseBMI2Instructions) { |
8469 | movq(value, Address(z, zidx, Address::times_4, 0)); |
8470 | rclq(value, 1); |
8471 | rorxq(value, value, 32); |
8472 | movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
8473 | } |
8474 | else { |
8475 | // clear new_carry |
8476 | xorl(new_carry, new_carry); |
8477 | |
8478 | // Shift z[i] by 1, or in previous carry and save new carry |
8479 | movq(value, Address(z, zidx, Address::times_4, 0)); |
8480 | shlq(value, 1); |
8481 | adcl(new_carry, 0); |
8482 | |
8483 | orq(value, prev_carry); |
8484 | rorq(value, 0x20); |
8485 | movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
8486 | |
8487 | // Set previous carry = new carry |
8488 | movl(prev_carry, new_carry); |
8489 | } |
8490 | jmp(L_fifth_loop); |
8491 | |
8492 | bind(L_fifth_loop_exit); |
8493 | } |
8494 | |
8495 | |
8496 | /** |
8497 | * Code for BigInteger::squareToLen() intrinsic |
8498 | * |
8499 | * rdi: x |
8500 | * rsi: len |
8501 | * r8: z |
8502 | * rcx: zlen |
8503 | * r12: tmp1 |
8504 | * r13: tmp2 |
8505 | * r14: tmp3 |
8506 | * r15: tmp4 |
8507 | * rbx: tmp5 |
8508 | * |
8509 | */ |
8510 | void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
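  // Algorithm sketch (following BigInteger.squareToLen): store the squares of the
  // input words right-shifted by one bit, add in the off-diagonal products, shift
  // the whole result left by one bit, then OR in the low bit of x[len-1].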
8511 | |
8512 | Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply; |
8513 | push(tmp1); |
8514 | push(tmp2); |
8515 | push(tmp3); |
8516 | push(tmp4); |
8517 | push(tmp5); |
8518 | |
8519 | // First loop |
8520 | // Store the squares, right shifted one bit (i.e., divided by 2). |
8521 | square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); |
8522 | |
8523 | // Add in off-diagonal sums. |
8524 | // |
8525 | // Second, third (nested) and fourth loops. |
8526 | // zlen +=2; |
8527 | // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { |
8528 | // carry = 0; |
8529 | // long op2 = x[xidx:xidx+1]; |
8530 | // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { |
8531 | // k -= 2; |
8532 | // long op1 = x[j:j+1]; |
8533 | // long sum = z[k:k+1]; |
8534 | // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); |
8535 | // z[k:k+1] = sum; |
8536 | // } |
8537 | // add_one_64(z, k, carry, tmp_regs); |
8538 | // } |
8539 | |
8540 | const Register carry = tmp5; |
8541 | const Register sum = tmp3; |
8542 | const Register op1 = tmp4; |
8543 | Register op2 = tmp2; |
8544 | |
8545 | push(zlen); |
8546 | push(len); |
8547 | addl(zlen,2); |
8548 | bind(L_second_loop); |
8549 | xorq(carry, carry); |
8550 | subl(zlen, 4); |
8551 | subl(len, 2); |
8552 | push(zlen); |
8553 | push(len); |
8554 | cmpl(len, 0); |
8555 | jccb(Assembler::lessEqual, L_second_loop_exit); |
8556 | |
8557 | // Multiply an array by one 64 bit long. |
8558 | if (UseBMI2Instructions) { |
8559 | op2 = rdxReg; |
8560 | movq(op2, Address(x, len, Address::times_4, 0)); |
8561 | rorxq(op2, op2, 32); |
8562 | } |
8563 | else { |
8564 | movq(op2, Address(x, len, Address::times_4, 0)); |
8565 | rorq(op2, 32); |
8566 | } |
8567 | |
8568 | bind(L_third_loop); |
8569 | decrementl(len); |
8570 | jccb(Assembler::negative, L_third_loop_exit); |
8571 | decrementl(len); |
8572 | jccb(Assembler::negative, L_last_x); |
8573 | |
8574 | movq(op1, Address(x, len, Address::times_4, 0)); |
8575 | rorq(op1, 32); |
8576 | |
8577 | bind(L_multiply); |
8578 | subl(zlen, 2); |
8579 | movq(sum, Address(z, zlen, Address::times_4, 0)); |
8580 | |
    // Multiply 64 bit by 64 bit; the low half of the product is added into sum and
    // the high half becomes the new carry.
8582 | if (UseBMI2Instructions) { |
8583 | multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); |
8584 | } |
8585 | else { |
8586 | multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
8587 | } |
8588 | |
8589 | movq(Address(z, zlen, Address::times_4, 0), sum); |
8590 | |
8591 | jmp(L_third_loop); |
8592 | bind(L_third_loop_exit); |
8593 | |
8594 | // Fourth loop |
  // Add 64 bit long carry into z with carry propagation.
8596 | // Uses offsetted zlen. |
8597 | add_one_64(z, zlen, carry, tmp1); |
8598 | |
8599 | pop(len); |
8600 | pop(zlen); |
8601 | jmp(L_second_loop); |
8602 | |
8603 | // Next infrequent code is moved outside loops. |
8604 | bind(L_last_x); |
8605 | movl(op1, Address(x, 0)); |
8606 | jmp(L_multiply); |
8607 | |
8608 | bind(L_second_loop_exit); |
8609 | pop(len); |
8610 | pop(zlen); |
8611 | pop(len); |
8612 | pop(zlen); |
8613 | |
8614 | // Fifth loop |
8615 | // Shift z left 1 bit. |
8616 | lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); |
8617 | |
8618 | // z[zlen-1] |= x[len-1] & 1; |
8619 | movl(tmp3, Address(x, len, Address::times_4, -4)); |
8620 | andl(tmp3, 1); |
8621 | orl(Address(z, zlen, Address::times_4, -4), tmp3); |
8622 | |
8623 | pop(tmp5); |
8624 | pop(tmp4); |
8625 | pop(tmp3); |
8626 | pop(tmp2); |
8627 | pop(tmp1); |
8628 | } |
8629 | |
8630 | /** |
8631 | * Helper function for mul_add() |
8632 | * Multiply the in[] by int k and add to out[] starting at offset offs using |
8633 | * 128 bit by 32 bit multiply and return the carry in tmp5. |
 * Only a length of in[] that is a multiple of four ints is processed by this function.
 * k is in rdxReg when BMI2 instructions are used, otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the corresponding indices in "in" and "out".
8638 | * tmp5 has the carry. |
8639 | * other registers are temporary and are modified. |
8640 | * |
8641 | */ |
8642 | void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, |
8643 | Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, |
8644 | Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
8645 | |
8646 | Label L_first_loop, L_first_loop_exit; |
8647 | |
8648 | movl(tmp1, len); |
8649 | shrl(tmp1, 2); |
8650 | |
8651 | bind(L_first_loop); |
8652 | subl(tmp1, 1); |
8653 | jccb(Assembler::negative, L_first_loop_exit); |
8654 | |
8655 | subl(len, 4); |
8656 | subl(offset, 4); |
8657 | |
8658 | Register op2 = tmp2; |
8659 | const Register sum = tmp3; |
8660 | const Register op1 = tmp4; |
8661 | const Register carry = tmp5; |
8662 | |
8663 | if (UseBMI2Instructions) { |
8664 | op2 = rdxReg; |
8665 | } |
8666 | |
8667 | movq(op1, Address(in, len, Address::times_4, 8)); |
8668 | rorq(op1, 32); |
8669 | movq(sum, Address(out, offset, Address::times_4, 8)); |
8670 | rorq(sum, 32); |
8671 | if (UseBMI2Instructions) { |
8672 | multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
8673 | } |
8674 | else { |
8675 | multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
8676 | } |
8677 | // Store back in big endian from little endian |
8678 | rorq(sum, 0x20); |
8679 | movq(Address(out, offset, Address::times_4, 8), sum); |
8680 | |
8681 | movq(op1, Address(in, len, Address::times_4, 0)); |
8682 | rorq(op1, 32); |
8683 | movq(sum, Address(out, offset, Address::times_4, 0)); |
8684 | rorq(sum, 32); |
8685 | if (UseBMI2Instructions) { |
8686 | multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
8687 | } |
8688 | else { |
8689 | multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
8690 | } |
8691 | // Store back in big endian from little endian |
8692 | rorq(sum, 0x20); |
8693 | movq(Address(out, offset, Address::times_4, 0), sum); |
8694 | |
8695 | jmp(L_first_loop); |
8696 | bind(L_first_loop_exit); |
8697 | } |
8698 | |
8699 | /** |
8700 | * Code for BigInteger::mulAdd() intrinsic |
8701 | * |
8702 | * rdi: out |
8703 | * rsi: in |
8704 | * r11: offs (out.length - offset) |
8705 | * rcx: len |
8706 | * r8: k |
8707 | * r12: tmp1 |
8708 | * r13: tmp2 |
8709 | * r14: tmp3 |
8710 | * r15: tmp4 |
8711 | * rbx: tmp5 |
8712 | * Multiply the in[] by word k and add to out[], return the carry in rax |
8713 | */ |
8714 | void MacroAssembler::mul_add(Register out, Register in, Register offs, |
8715 | Register len, Register k, Register tmp1, Register tmp2, Register tmp3, |
8716 | Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
8717 | |
8718 | Label L_carry, L_last_in, L_done; |
8719 | |
8720 | // carry = 0; |
8721 | // for (int j=len-1; j >= 0; j--) { |
8722 | // long product = (in[j] & LONG_MASK) * kLong + |
8723 | // (out[offs] & LONG_MASK) + carry; |
8724 | // out[offs--] = (int)product; |
8725 | // carry = product >>> 32; |
8726 | // } |
8727 | // |
8728 | push(tmp1); |
8729 | push(tmp2); |
8730 | push(tmp3); |
8731 | push(tmp4); |
8732 | push(tmp5); |
8733 | |
8734 | Register op2 = tmp2; |
8735 | const Register sum = tmp3; |
8736 | const Register op1 = tmp4; |
8737 | const Register carry = tmp5; |
8738 | |
8739 | if (UseBMI2Instructions) { |
8740 | op2 = rdxReg; |
8741 | movl(op2, k); |
8742 | } |
8743 | else { |
8744 | movl(op2, k); |
8745 | } |
8746 | |
8747 | xorq(carry, carry); |
8748 | |
8749 | //First loop |
8750 | |
8751 | //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply |
8752 | //The carry is in tmp5 |
8753 | mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); |
8754 | |
8755 | //Multiply the trailing in[] entry using 64 bit by 32 bit, if any |
8756 | decrementl(len); |
8757 | jccb(Assembler::negative, L_carry); |
8758 | decrementl(len); |
8759 | jccb(Assembler::negative, L_last_in); |
8760 | |
8761 | movq(op1, Address(in, len, Address::times_4, 0)); |
8762 | rorq(op1, 32); |
8763 | |
8764 | subl(offs, 2); |
8765 | movq(sum, Address(out, offs, Address::times_4, 0)); |
8766 | rorq(sum, 32); |
8767 | |
8768 | if (UseBMI2Instructions) { |
8769 | multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
8770 | } |
8771 | else { |
8772 | multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
8773 | } |
8774 | |
8775 | // Store back in big endian from little endian |
8776 | rorq(sum, 0x20); |
8777 | movq(Address(out, offs, Address::times_4, 0), sum); |
8778 | |
8779 | testl(len, len); |
8780 | jccb(Assembler::zero, L_carry); |
8781 | |
8782 | //Multiply the last in[] entry, if any |
8783 | bind(L_last_in); |
8784 | movl(op1, Address(in, 0)); |
8785 | movl(sum, Address(out, offs, Address::times_4, -4)); |
8786 | |
8787 | movl(raxReg, k); |
8788 | mull(op1); //tmp4 * eax -> edx:eax |
8789 | addl(sum, carry); |
8790 | adcl(rdxReg, 0); |
8791 | addl(sum, raxReg); |
8792 | adcl(rdxReg, 0); |
8793 | movl(carry, rdxReg); |
8794 | |
8795 | movl(Address(out, offs, Address::times_4, -4), sum); |
8796 | |
8797 | bind(L_carry); |
8798 | //return tmp5/carry as carry in rax |
8799 | movl(rax, carry); |
8800 | |
8801 | bind(L_done); |
8802 | pop(tmp5); |
8803 | pop(tmp4); |
8804 | pop(tmp3); |
8805 | pop(tmp2); |
8806 | pop(tmp1); |
8807 | } |
8808 | #endif |
8809 | |
8810 | /** |
8811 | * Emits code to update CRC-32 with a byte value according to constants in table |
8812 | * |
 * @param [in,out] crc   Register containing the crc.
 * @param [in]     val   Register containing the byte to fold into the CRC.
 * @param [in]     table Register containing the table of crc constants.
8816 | * |
8817 | * uint32_t crc; |
8818 | * val = crc_table[(val ^ crc) & 0xFF]; |
8819 | * crc = val ^ (crc >> 8); |
8820 | * |
8821 | */ |
8822 | void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { |
8823 | xorl(val, crc); |
8824 | andl(val, 0xFF); |
8825 | shrl(crc, 8); // unsigned shift |
8826 | xorl(crc, Address(table, val, Address::times_4, 0)); |
8827 | } |
8828 | |
8829 | /** |
8830 | * Fold four 128-bit data chunks |
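 *
 * In each 128-bit lane this is the usual CRC folding step; roughly (a sketch,
 * with clmul denoting carry-less multiplication):
 *
 *   xcrc' = clmul(xcrc[127:64], xK[127:64]) ^ clmul(xcrc[63:0], xK[63:0]) ^ mem[buf + offset]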
8831 | */ |
8832 | void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { |
  evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [127:64]
8834 | evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0] |
8835 | evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */); |
8836 | evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */); |
8837 | } |
8838 | |
8839 | /** |
8840 | * Fold 128-bit data chunk |
8841 | */ |
8842 | void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { |
8843 | if (UseAVX > 0) { |
    vpclmulhdq(xtmp, xK, xcrc); // [127:64]
8845 | vpclmulldq(xcrc, xK, xcrc); // [63:0] |
8846 | vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); |
8847 | pxor(xcrc, xtmp); |
8848 | } else { |
8849 | movdqa(xtmp, xcrc); |
    pclmulhdq(xtmp, xK); // [127:64]
8851 | pclmulldq(xcrc, xK); // [63:0] |
8852 | pxor(xcrc, xtmp); |
8853 | movdqu(xtmp, Address(buf, offset)); |
8854 | pxor(xcrc, xtmp); |
8855 | } |
8856 | } |
8857 | |
8858 | void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { |
8859 | if (UseAVX > 0) { |
8860 | vpclmulhdq(xtmp, xK, xcrc); |
8861 | vpclmulldq(xcrc, xK, xcrc); |
8862 | pxor(xcrc, xbuf); |
8863 | pxor(xcrc, xtmp); |
8864 | } else { |
8865 | movdqa(xtmp, xcrc); |
8866 | pclmulhdq(xtmp, xK); |
8867 | pclmulldq(xcrc, xK); |
8868 | pxor(xcrc, xbuf); |
8869 | pxor(xcrc, xtmp); |
8870 | } |
8871 | } |
8872 | |
8873 | /** |
8874 | * 8-bit folds to compute 32-bit CRC |
8875 | * |
8876 | * uint64_t xcrc; |
8877 | * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); |
8878 | */ |
8879 | void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { |
8880 | movdl(tmp, xcrc); |
8881 | andl(tmp, 0xFF); |
8882 | movdl(xtmp, Address(table, tmp, Address::times_4, 0)); |
8883 | psrldq(xcrc, 1); // unsigned shift one byte |
8884 | pxor(xcrc, xtmp); |
8885 | } |
8886 | |
8887 | /** |
8888 | * uint32_t crc; |
8889 | * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); |
8890 | */ |
8891 | void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { |
8892 | movl(tmp, crc); |
8893 | andl(tmp, 0xFF); |
8894 | shrl(crc, 8); |
8895 | xorl(crc, Address(table, tmp, Address::times_4, 0)); |
8896 | } |
8897 | |
8898 | /** |
8899 | * @param crc register containing existing CRC (32-bit) |
8900 | * @param buf register pointing to input byte buffer (byte*) |
8901 | * @param len register containing number of bytes |
8902 | * @param table register that will contain address of CRC table |
8903 | * @param tmp scratch register |
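 *
 * Overall flow, as implemented below: advance buf byte-by-byte to a 16-byte
 * boundary, fold 512 bits (four 128-bit streams) per iteration while enough
 * input remains, reduce the four streams to a single 128-bit value, fold that
 * down to 32 bits, and finally process the remaining tail bytes via table
 * lookups.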
8904 | */ |
8905 | void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { |
8906 | assert_different_registers(crc, buf, len, table, tmp, rax); |
8907 | |
8908 | Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; |
8909 | Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; |
8910 | |
  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
  // context for the registers used, since all instructions below use 128-bit mode.
  // On EVEX without VL and BW, these instructions will all be AVX.
8914 | lea(table, ExternalAddress(StubRoutines::crc_table_addr())); |
8915 | notl(crc); // ~crc |
8916 | cmpl(len, 16); |
8917 | jcc(Assembler::less, L_tail); |
8918 | |
8919 | // Align buffer to 16 bytes |
8920 | movl(tmp, buf); |
8921 | andl(tmp, 0xF); |
8922 | jccb(Assembler::zero, L_aligned); |
8923 | subl(tmp, 16); |
8924 | addl(len, tmp); |
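  // tmp now holds -(bytes needed to reach 16-byte alignment) and len has been
  // reduced by that many bytes; the loop below consumes them one at a time
  // until tmp reaches zero.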
8925 | |
8926 | align(4); |
8927 | BIND(L_align_loop); |
8928 | movsbl(rax, Address(buf, 0)); // load byte with sign extension |
8929 | update_byte_crc32(crc, rax, table); |
8930 | increment(buf); |
8931 | incrementl(tmp); |
8932 | jccb(Assembler::less, L_align_loop); |
8933 | |
8934 | BIND(L_aligned); |
8935 | movl(tmp, len); // save |
8936 | shrl(len, 4); |
8937 | jcc(Assembler::zero, L_tail_restore); |
8938 | |
8939 | // Fold total 512 bits of polynomial on each iteration |
8940 | if (VM_Version::supports_vpclmulqdq()) { |
8941 | Label Parallel_loop, L_No_Parallel; |
8942 | |
8943 | cmpl(len, 8); |
8944 | jccb(Assembler::less, L_No_Parallel); |
8945 | |
8946 | movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); |
8947 | evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit); |
8948 | movdl(xmm5, crc); |
8949 | evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit); |
8950 | addptr(buf, 64); |
8951 | subl(len, 7); |
8952 | evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits |
8953 | |
8954 | BIND(Parallel_loop); |
8955 | fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0); |
8956 | addptr(buf, 64); |
8957 | subl(len, 4); |
8958 | jcc(Assembler::greater, Parallel_loop); |
8959 | |
8960 | vextracti64x2(xmm2, xmm1, 0x01); |
8961 | vextracti64x2(xmm3, xmm1, 0x02); |
8962 | vextracti64x2(xmm4, xmm1, 0x03); |
8963 | jmp(L_fold_512b); |
8964 | |
8965 | BIND(L_No_Parallel); |
8966 | } |
8967 | // Fold crc into first bytes of vector |
8968 | movdqa(xmm1, Address(buf, 0)); |
8969 | movdl(rax, xmm1); |
8970 | xorl(crc, rax); |
8971 | if (VM_Version::supports_sse4_1()) { |
8972 | pinsrd(xmm1, crc, 0); |
8973 | } else { |
8974 | pinsrw(xmm1, crc, 0); |
8975 | shrl(crc, 16); |
8976 | pinsrw(xmm1, crc, 1); |
8977 | } |
8978 | addptr(buf, 16); |
8979 | subl(len, 4); // len > 0 |
8980 | jcc(Assembler::less, L_fold_tail); |
8981 | |
8982 | movdqa(xmm2, Address(buf, 0)); |
8983 | movdqa(xmm3, Address(buf, 16)); |
8984 | movdqa(xmm4, Address(buf, 32)); |
8985 | addptr(buf, 48); |
8986 | subl(len, 3); |
8987 | jcc(Assembler::lessEqual, L_fold_512b); |
8988 | |
8989 | // Fold total 512 bits of polynomial on each iteration, |
8990 | // 128 bits per each of 4 parallel streams. |
8991 | movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); |
8992 | |
8993 | align(32); |
8994 | BIND(L_fold_512b_loop); |
8995 | fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); |
8996 | fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); |
8997 | fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); |
8998 | fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); |
8999 | addptr(buf, 64); |
9000 | subl(len, 4); |
9001 | jcc(Assembler::greater, L_fold_512b_loop); |
9002 | |
9003 | // Fold 512 bits to 128 bits. |
9004 | BIND(L_fold_512b); |
9005 | movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); |
9006 | fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); |
9007 | fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); |
9008 | fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); |
9009 | |
  // Fold the rest of the 128-bit data chunks
9011 | BIND(L_fold_tail); |
9012 | addl(len, 3); |
9013 | jccb(Assembler::lessEqual, L_fold_128b); |
9014 | movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); |
9015 | |
9016 | BIND(L_fold_tail_loop); |
9017 | fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); |
9018 | addptr(buf, 16); |
9019 | decrementl(len); |
9020 | jccb(Assembler::greater, L_fold_tail_loop); |
9021 | |
9022 | // Fold 128 bits in xmm1 down into 32 bits in crc register. |
9023 | BIND(L_fold_128b); |
9024 | movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); |
9025 | if (UseAVX > 0) { |
9026 | vpclmulqdq(xmm2, xmm0, xmm1, 0x1); |
9027 | vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); |
9028 | vpclmulqdq(xmm0, xmm0, xmm3, 0x1); |
9029 | } else { |
9030 | movdqa(xmm2, xmm0); |
9031 | pclmulqdq(xmm2, xmm1, 0x1); |
9032 | movdqa(xmm3, xmm0); |
9033 | pand(xmm3, xmm2); |
9034 | pclmulqdq(xmm0, xmm3, 0x1); |
9035 | } |
9036 | psrldq(xmm1, 8); |
9037 | psrldq(xmm2, 4); |
9038 | pxor(xmm0, xmm1); |
9039 | pxor(xmm0, xmm2); |
9040 | |
9041 | // 8 8-bit folds to compute 32-bit CRC. |
9042 | for (int j = 0; j < 4; j++) { |
9043 | fold_8bit_crc32(xmm0, table, xmm1, rax); |
9044 | } |
9045 | movdl(crc, xmm0); // mov 32 bits to general register |
9046 | for (int j = 0; j < 4; j++) { |
9047 | fold_8bit_crc32(crc, table, rax); |
9048 | } |
9049 | |
9050 | BIND(L_tail_restore); |
9051 | movl(len, tmp); // restore |
9052 | BIND(L_tail); |
9053 | andl(len, 0xf); |
9054 | jccb(Assembler::zero, L_exit); |
9055 | |
  // Fold the remaining bytes
9057 | align(4); |
9058 | BIND(L_tail_loop); |
9059 | movsbl(rax, Address(buf, 0)); // load byte with sign extension |
9060 | update_byte_crc32(crc, rax, table); |
9061 | increment(buf); |
9062 | decrementl(len); |
9063 | jccb(Assembler::greater, L_tail_loop); |
9064 | |
9065 | BIND(L_exit); |
  notl(crc); // ~crc
9067 | } |
9068 | |
9069 | #ifdef _LP64 |
9070 | // S. Gueron / Information Processing Letters 112 (2012) 184 |
9071 | // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. |
// Input: A 32-bit value B = [byte3, byte2, byte1, byte0].
9073 | // Output: the 64-bit carry-less product of B * CONST |
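// Conceptually (a sketch of what the loads and shifts below compute):
//   uint64_t Q1 = TABLEExt[n][ B        & 0xFF];
//   uint64_t Q2 = TABLEExt[n][(B >>  8) & 0xFF];
//   uint64_t Q3 = TABLEExt[n][(B >> 16) & 0xFF];
//   uint64_t Q4 = TABLEExt[n][(B >> 24) & 0xFF];
//   return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);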
9074 | void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n, |
9075 | Register tmp1, Register tmp2, Register tmp3) { |
9076 | lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); |
9077 | if (n > 0) { |
9078 | addq(tmp3, n * 256 * 8); |
9079 | } |
9080 | // Q1 = TABLEExt[n][B & 0xFF]; |
9081 | movl(tmp1, in); |
9082 | andl(tmp1, 0x000000FF); |
9083 | shll(tmp1, 3); |
9084 | addq(tmp1, tmp3); |
9085 | movq(tmp1, Address(tmp1, 0)); |
9086 | |
9087 | // Q2 = TABLEExt[n][B >> 8 & 0xFF]; |
9088 | movl(tmp2, in); |
9089 | shrl(tmp2, 8); |
9090 | andl(tmp2, 0x000000FF); |
9091 | shll(tmp2, 3); |
9092 | addq(tmp2, tmp3); |
9093 | movq(tmp2, Address(tmp2, 0)); |
9094 | |
9095 | shlq(tmp2, 8); |
9096 | xorq(tmp1, tmp2); |
9097 | |
9098 | // Q3 = TABLEExt[n][B >> 16 & 0xFF]; |
9099 | movl(tmp2, in); |
9100 | shrl(tmp2, 16); |
9101 | andl(tmp2, 0x000000FF); |
9102 | shll(tmp2, 3); |
9103 | addq(tmp2, tmp3); |
9104 | movq(tmp2, Address(tmp2, 0)); |
9105 | |
9106 | shlq(tmp2, 16); |
9107 | xorq(tmp1, tmp2); |
9108 | |
9109 | // Q4 = TABLEExt[n][B >> 24 & 0xFF]; |
9110 | shrl(in, 24); |
9111 | andl(in, 0x000000FF); |
9112 | shll(in, 3); |
9113 | addq(in, tmp3); |
9114 | movq(in, Address(in, 0)); |
9115 | |
9116 | shlq(in, 24); |
9117 | xorq(in, tmp1); |
9118 | // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; |
9119 | } |
9120 | |
9121 | void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, |
9122 | Register in_out, |
9123 | uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, |
9124 | XMMRegister w_xtmp2, |
9125 | Register tmp1, |
9126 | Register n_tmp2, Register n_tmp3) { |
9127 | if (is_pclmulqdq_supported) { |
9128 | movdl(w_xtmp1, in_out); // modified blindly |
9129 | |
9130 | movl(tmp1, const_or_pre_comp_const_index); |
9131 | movdl(w_xtmp2, tmp1); |
9132 | pclmulqdq(w_xtmp1, w_xtmp2, 0); |
9133 | |
9134 | movdq(in_out, w_xtmp1); |
9135 | } else { |
9136 | crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); |
9137 | } |
9138 | } |
9139 | |
9140 | // Recombination Alternative 2: No bit-reflections |
9141 | // T1 = (CRC_A * U1) << 1 |
9142 | // T2 = (CRC_B * U2) << 1 |
9143 | // C1 = T1 >> 32 |
9144 | // C2 = T2 >> 32 |
9145 | // T1 = T1 & 0xFFFFFFFF |
9146 | // T2 = T2 & 0xFFFFFFFF |
9147 | // T1 = CRC32(0, T1) |
9148 | // T2 = CRC32(0, T2) |
9149 | // C1 = C1 ^ T1 |
9150 | // C2 = C2 ^ T2 |
9151 | // CRC = C1 ^ C2 ^ CRC_C |
9152 | void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, |
9153 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9154 | Register tmp1, Register tmp2, |
9155 | Register n_tmp3) { |
9156 | crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); |
9157 | crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); |
9158 | shlq(in_out, 1); |
9159 | movl(tmp1, in_out); |
9160 | shrq(in_out, 32); |
9161 | xorl(tmp2, tmp2); |
9162 | crc32(tmp2, tmp1, 4); |
9163 | xorl(in_out, tmp2); // we don't care about upper 32 bit contents here |
9164 | shlq(in1, 1); |
9165 | movl(tmp1, in1); |
9166 | shrq(in1, 32); |
9167 | xorl(tmp2, tmp2); |
9168 | crc32(tmp2, tmp1, 4); |
9169 | xorl(in1, tmp2); |
9170 | xorl(in_out, in1); |
9171 | xorl(in_out, in2); |
9172 | } |
9173 | |
9174 | // Set N to predefined value |
// Subtract from the length of the buffer
9176 | // execute in a loop: |
9177 | // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 |
9178 | // for i = 1 to N do |
9179 | // CRC_A = CRC32(CRC_A, A[i]) |
9180 | // CRC_B = CRC32(CRC_B, B[i]) |
9181 | // CRC_C = CRC32(CRC_C, C[i]) |
9182 | // end for |
9183 | // Recombine |
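// Register roles (as this routine is used by crc32c_ipl_alg2_alt2 below):
//   in_out1 - remaining length in bytes, in_out2 - buffer pointer,
//   in_out3 - running CRC for the A stream (CRC_A).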
9184 | void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, |
9185 | Register in_out1, Register in_out2, Register in_out3, |
9186 | Register tmp1, Register tmp2, Register tmp3, |
9187 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9188 | Register tmp4, Register tmp5, |
9189 | Register n_tmp6) { |
9190 | Label L_processPartitions; |
9191 | Label L_processPartition; |
9192 | Label L_exit; |
9193 | |
9194 | bind(L_processPartitions); |
9195 | cmpl(in_out1, 3 * size); |
9196 | jcc(Assembler::less, L_exit); |
9197 | xorl(tmp1, tmp1); |
9198 | xorl(tmp2, tmp2); |
9199 | movq(tmp3, in_out2); |
9200 | addq(tmp3, size); |
9201 | |
9202 | bind(L_processPartition); |
9203 | crc32(in_out3, Address(in_out2, 0), 8); |
9204 | crc32(tmp1, Address(in_out2, size), 8); |
9205 | crc32(tmp2, Address(in_out2, size * 2), 8); |
9206 | addq(in_out2, 8); |
9207 | cmpq(in_out2, tmp3); |
9208 | jcc(Assembler::less, L_processPartition); |
9209 | crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, |
9210 | w_xtmp1, w_xtmp2, w_xtmp3, |
9211 | tmp4, tmp5, |
9212 | n_tmp6); |
9213 | addq(in_out2, 2 * size); |
9214 | subl(in_out1, 3 * size); |
9215 | jmp(L_processPartitions); |
9216 | |
9217 | bind(L_exit); |
9218 | } |
9219 | #else |
9220 | void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, |
9221 | Register tmp1, Register tmp2, Register tmp3, |
9222 | XMMRegister xtmp1, XMMRegister xtmp2) { |
9223 | lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); |
9224 | if (n > 0) { |
9225 | addl(tmp3, n * 256 * 8); |
9226 | } |
9227 | // Q1 = TABLEExt[n][B & 0xFF]; |
9228 | movl(tmp1, in_out); |
9229 | andl(tmp1, 0x000000FF); |
9230 | shll(tmp1, 3); |
9231 | addl(tmp1, tmp3); |
9232 | movq(xtmp1, Address(tmp1, 0)); |
9233 | |
9234 | // Q2 = TABLEExt[n][B >> 8 & 0xFF]; |
9235 | movl(tmp2, in_out); |
9236 | shrl(tmp2, 8); |
9237 | andl(tmp2, 0x000000FF); |
9238 | shll(tmp2, 3); |
9239 | addl(tmp2, tmp3); |
9240 | movq(xtmp2, Address(tmp2, 0)); |
9241 | |
9242 | psllq(xtmp2, 8); |
9243 | pxor(xtmp1, xtmp2); |
9244 | |
9245 | // Q3 = TABLEExt[n][B >> 16 & 0xFF]; |
9246 | movl(tmp2, in_out); |
9247 | shrl(tmp2, 16); |
9248 | andl(tmp2, 0x000000FF); |
9249 | shll(tmp2, 3); |
9250 | addl(tmp2, tmp3); |
9251 | movq(xtmp2, Address(tmp2, 0)); |
9252 | |
9253 | psllq(xtmp2, 16); |
9254 | pxor(xtmp1, xtmp2); |
9255 | |
9256 | // Q4 = TABLEExt[n][B >> 24 & 0xFF]; |
9257 | shrl(in_out, 24); |
9258 | andl(in_out, 0x000000FF); |
9259 | shll(in_out, 3); |
9260 | addl(in_out, tmp3); |
9261 | movq(xtmp2, Address(in_out, 0)); |
9262 | |
9263 | psllq(xtmp2, 24); |
  pxor(xtmp1, xtmp2); // result in xtmp1
9265 | // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; |
9266 | } |
9267 | |
9268 | void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, |
9269 | Register in_out, |
9270 | uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, |
9271 | XMMRegister w_xtmp2, |
9272 | Register tmp1, |
9273 | Register n_tmp2, Register n_tmp3) { |
9274 | if (is_pclmulqdq_supported) { |
9275 | movdl(w_xtmp1, in_out); |
9276 | |
9277 | movl(tmp1, const_or_pre_comp_const_index); |
9278 | movdl(w_xtmp2, tmp1); |
9279 | pclmulqdq(w_xtmp1, w_xtmp2, 0); |
9280 | // Keep result in XMM since GPR is 32 bit in length |
9281 | } else { |
9282 | crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); |
9283 | } |
9284 | } |
9285 | |
9286 | void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, |
9287 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9288 | Register tmp1, Register tmp2, |
9289 | Register n_tmp3) { |
9290 | crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); |
9291 | crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); |
9292 | |
9293 | psllq(w_xtmp1, 1); |
9294 | movdl(tmp1, w_xtmp1); |
9295 | psrlq(w_xtmp1, 32); |
9296 | movdl(in_out, w_xtmp1); |
9297 | |
9298 | xorl(tmp2, tmp2); |
9299 | crc32(tmp2, tmp1, 4); |
9300 | xorl(in_out, tmp2); |
9301 | |
9302 | psllq(w_xtmp2, 1); |
9303 | movdl(tmp1, w_xtmp2); |
9304 | psrlq(w_xtmp2, 32); |
9305 | movdl(in1, w_xtmp2); |
9306 | |
9307 | xorl(tmp2, tmp2); |
9308 | crc32(tmp2, tmp1, 4); |
9309 | xorl(in1, tmp2); |
9310 | xorl(in_out, in1); |
9311 | xorl(in_out, in2); |
9312 | } |
9313 | |
9314 | void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, |
9315 | Register in_out1, Register in_out2, Register in_out3, |
9316 | Register tmp1, Register tmp2, Register tmp3, |
9317 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9318 | Register tmp4, Register tmp5, |
9319 | Register n_tmp6) { |
9320 | Label L_processPartitions; |
9321 | Label L_processPartition; |
9322 | Label L_exit; |
9323 | |
9324 | bind(L_processPartitions); |
9325 | cmpl(in_out1, 3 * size); |
9326 | jcc(Assembler::less, L_exit); |
9327 | xorl(tmp1, tmp1); |
9328 | xorl(tmp2, tmp2); |
9329 | movl(tmp3, in_out2); |
9330 | addl(tmp3, size); |
9331 | |
9332 | bind(L_processPartition); |
9333 | crc32(in_out3, Address(in_out2, 0), 4); |
9334 | crc32(tmp1, Address(in_out2, size), 4); |
9335 | crc32(tmp2, Address(in_out2, size*2), 4); |
9336 | crc32(in_out3, Address(in_out2, 0+4), 4); |
9337 | crc32(tmp1, Address(in_out2, size+4), 4); |
9338 | crc32(tmp2, Address(in_out2, size*2+4), 4); |
9339 | addl(in_out2, 8); |
9340 | cmpl(in_out2, tmp3); |
9341 | jcc(Assembler::less, L_processPartition); |
9342 | |
9343 | push(tmp3); |
9344 | push(in_out1); |
9345 | push(in_out2); |
9346 | tmp4 = tmp3; |
9347 | tmp5 = in_out1; |
9348 | n_tmp6 = in_out2; |
9349 | |
9350 | crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, |
9351 | w_xtmp1, w_xtmp2, w_xtmp3, |
9352 | tmp4, tmp5, |
9353 | n_tmp6); |
9354 | |
9355 | pop(in_out2); |
9356 | pop(in_out1); |
9357 | pop(tmp3); |
9358 | |
9359 | addl(in_out2, 2 * size); |
9360 | subl(in_out1, 3 * size); |
9361 | jmp(L_processPartitions); |
9362 | |
9363 | bind(L_exit); |
9364 | } |
9365 | #endif //LP64 |
9366 | |
9367 | #ifdef _LP64 |
9368 | // Algorithm 2: Pipelined usage of the CRC32 instruction. |
9369 | // Input: A buffer I of L bytes. |
9370 | // Output: the CRC32C value of the buffer. |
9371 | // Notations: |
9372 | // Write L = 24N + r, with N = floor (L/24). |
9373 | // r = L mod 24 (0 <= r < 24). |
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist
// of N quadwords, and R consists of r bytes.
9376 | // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 |
9377 | // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 |
9378 | // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 |
9379 | // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 |
9380 | void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, |
9381 | Register tmp1, Register tmp2, Register tmp3, |
9382 | Register tmp4, Register tmp5, Register tmp6, |
9383 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9384 | bool is_pclmulqdq_supported) { |
9385 | uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; |
9386 | Label L_wordByWord; |
9387 | Label L_byteByByteProlog; |
9388 | Label L_byteByByte; |
9389 | Label L_exit; |
9390 | |
9391 | if (is_pclmulqdq_supported ) { |
9392 | const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; |
9393 | const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); |
9394 | |
9395 | const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); |
9396 | const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); |
9397 | |
9398 | const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); |
9399 | const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); |
9400 | assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"" ); |
9401 | } else { |
9402 | const_or_pre_comp_const_index[0] = 1; |
9403 | const_or_pre_comp_const_index[1] = 0; |
9404 | |
9405 | const_or_pre_comp_const_index[2] = 3; |
9406 | const_or_pre_comp_const_index[3] = 2; |
9407 | |
9408 | const_or_pre_comp_const_index[4] = 5; |
9409 | const_or_pre_comp_const_index[5] = 4; |
9410 | } |
9411 | crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, |
9412 | in2, in1, in_out, |
9413 | tmp1, tmp2, tmp3, |
9414 | w_xtmp1, w_xtmp2, w_xtmp3, |
9415 | tmp4, tmp5, |
9416 | tmp6); |
9417 | crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, |
9418 | in2, in1, in_out, |
9419 | tmp1, tmp2, tmp3, |
9420 | w_xtmp1, w_xtmp2, w_xtmp3, |
9421 | tmp4, tmp5, |
9422 | tmp6); |
9423 | crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, |
9424 | in2, in1, in_out, |
9425 | tmp1, tmp2, tmp3, |
9426 | w_xtmp1, w_xtmp2, w_xtmp3, |
9427 | tmp4, tmp5, |
9428 | tmp6); |
9429 | movl(tmp1, in2); |
9430 | andl(tmp1, 0x00000007); |
9431 | negl(tmp1); |
9432 | addl(tmp1, in2); |
9433 | addq(tmp1, in1); |
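  // tmp1 = in1 + (in2 & ~7): end of the region that can be consumed four
  // bytes at a time; the final (in2 & 7) bytes are handled one at a time
  // in the byte-by-byte loop below.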
9434 | |
9435 | BIND(L_wordByWord); |
9436 | cmpq(in1, tmp1); |
9437 | jcc(Assembler::greaterEqual, L_byteByByteProlog); |
9438 | crc32(in_out, Address(in1, 0), 4); |
9439 | addq(in1, 4); |
9440 | jmp(L_wordByWord); |
9441 | |
9442 | BIND(L_byteByByteProlog); |
9443 | andl(in2, 0x00000007); |
9444 | movl(tmp2, 1); |
9445 | |
9446 | BIND(L_byteByByte); |
9447 | cmpl(tmp2, in2); |
9448 | jccb(Assembler::greater, L_exit); |
9449 | crc32(in_out, Address(in1, 0), 1); |
9450 | incq(in1); |
9451 | incl(tmp2); |
9452 | jmp(L_byteByByte); |
9453 | |
9454 | BIND(L_exit); |
9455 | } |
9456 | #else |
9457 | void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, |
9458 | Register tmp1, Register tmp2, Register tmp3, |
9459 | Register tmp4, Register tmp5, Register tmp6, |
9460 | XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, |
9461 | bool is_pclmulqdq_supported) { |
9462 | uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; |
9463 | Label L_wordByWord; |
9464 | Label L_byteByByteProlog; |
9465 | Label L_byteByByte; |
9466 | Label L_exit; |
9467 | |
9468 | if (is_pclmulqdq_supported) { |
9469 | const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; |
9470 | const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); |
9471 | |
9472 | const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); |
9473 | const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); |
9474 | |
9475 | const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); |
9476 | const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); |
9477 | } else { |
9478 | const_or_pre_comp_const_index[0] = 1; |
9479 | const_or_pre_comp_const_index[1] = 0; |
9480 | |
9481 | const_or_pre_comp_const_index[2] = 3; |
9482 | const_or_pre_comp_const_index[3] = 2; |
9483 | |
9484 | const_or_pre_comp_const_index[4] = 5; |
9485 | const_or_pre_comp_const_index[5] = 4; |
9486 | } |
9487 | crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, |
9488 | in2, in1, in_out, |
9489 | tmp1, tmp2, tmp3, |
9490 | w_xtmp1, w_xtmp2, w_xtmp3, |
9491 | tmp4, tmp5, |
9492 | tmp6); |
9493 | crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, |
9494 | in2, in1, in_out, |
9495 | tmp1, tmp2, tmp3, |
9496 | w_xtmp1, w_xtmp2, w_xtmp3, |
9497 | tmp4, tmp5, |
9498 | tmp6); |
9499 | crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, |
9500 | in2, in1, in_out, |
9501 | tmp1, tmp2, tmp3, |
9502 | w_xtmp1, w_xtmp2, w_xtmp3, |
9503 | tmp4, tmp5, |
9504 | tmp6); |
9505 | movl(tmp1, in2); |
9506 | andl(tmp1, 0x00000007); |
9507 | negl(tmp1); |
9508 | addl(tmp1, in2); |
9509 | addl(tmp1, in1); |
9510 | |
9511 | BIND(L_wordByWord); |
9512 | cmpl(in1, tmp1); |
9513 | jcc(Assembler::greaterEqual, L_byteByByteProlog); |
9514 | crc32(in_out, Address(in1,0), 4); |
9515 | addl(in1, 4); |
9516 | jmp(L_wordByWord); |
9517 | |
9518 | BIND(L_byteByByteProlog); |
9519 | andl(in2, 0x00000007); |
9520 | movl(tmp2, 1); |
9521 | |
9522 | BIND(L_byteByByte); |
9523 | cmpl(tmp2, in2); |
9524 | jccb(Assembler::greater, L_exit); |
9525 | movb(tmp1, Address(in1, 0)); |
9526 | crc32(in_out, tmp1, 1); |
9527 | incl(in1); |
9528 | incl(tmp2); |
9529 | jmp(L_byteByByte); |
9530 | |
9531 | BIND(L_exit); |
9532 | } |
9533 | #endif // LP64 |
9534 | #undef BIND |
9535 | #undef BLOCK_COMMENT |
9536 | |
9537 | // Compress char[] array to byte[]. |
9538 | // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java |
9539 | // @HotSpotIntrinsicCandidate |
9540 | // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { |
9541 | // for (int i = 0; i < len; i++) { |
9542 | // int c = src[srcOff++]; |
9543 | // if (c >>> 8 != 0) { |
9544 | // return 0; |
9545 | // } |
9546 | // dst[dstOff++] = (byte)c; |
9547 | // } |
9548 | // return len; |
9549 | // } |
9550 | void MacroAssembler::char_array_compress(Register src, Register dst, Register len, |
9551 | XMMRegister tmp1Reg, XMMRegister tmp2Reg, |
9552 | XMMRegister tmp3Reg, XMMRegister tmp4Reg, |
9553 | Register tmp5, Register result) { |
9554 | Label copy_chars_loop, return_length, return_zero, done; |
9555 | |
9556 | // rsi: src |
9557 | // rdi: dst |
9558 | // rdx: len |
9559 | // rcx: tmp5 |
9560 | // rax: result |
9561 | |
9562 | // rsi holds start addr of source char[] to be compressed |
9563 | // rdi holds start addr of destination byte[] |
9564 | // rdx holds length |
9565 | |
9566 | assert(len != result, "" ); |
9567 | |
9568 | // save length for return |
9569 | push(len); |
9570 | |
9571 | if ((UseAVX > 2) && // AVX512 |
9572 | VM_Version::supports_avx512vlbw() && |
9573 | VM_Version::supports_bmi2()) { |
9574 | |
9575 | Label copy_32_loop, copy_loop_tail, below_threshold; |
9576 | |
9577 | // alignment |
9578 | Label post_alignment; |
9579 | |
    // if the length of the string is less than 32, handle it the old-fashioned way
9581 | testl(len, -32); |
9582 | jcc(Assembler::zero, below_threshold); |
9583 | |
    // First check whether each character is compressible (<= 0xFF).
9585 | // Create mask to test for Unicode chars inside zmm vector |
9586 | movl(result, 0x00FF); |
9587 | evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); |
9588 | |
9589 | testl(len, -64); |
9590 | jcc(Assembler::zero, post_alignment); |
9591 | |
9592 | movl(tmp5, dst); |
9593 | andl(tmp5, (32 - 1)); |
9594 | negl(tmp5); |
9595 | andl(tmp5, (32 - 1)); |
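    // tmp5 = (-dst) & 31: the number of chars to compress (one output byte
    // each) so that dst becomes 32-byte aligned for the main loop.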
9596 | |
9597 | // bail out when there is nothing to be done |
9598 | testl(tmp5, 0xFFFFFFFF); |
9599 | jcc(Assembler::zero, post_alignment); |
9600 | |
    // ~(~0 << tmp5), where tmp5 is the # of elements to process in this alignment prologue
9602 | movl(result, 0xFFFFFFFF); |
9603 | shlxl(result, result, tmp5); |
9604 | notl(result); |
9605 | kmovdl(k3, result); |
9606 | |
9607 | evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit); |
9608 | evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); |
9609 | ktestd(k2, k3); |
9610 | jcc(Assembler::carryClear, return_zero); |
9611 | |
9612 | evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit); |
9613 | |
9614 | addptr(src, tmp5); |
9615 | addptr(src, tmp5); |
9616 | addptr(dst, tmp5); |
9617 | subl(len, tmp5); |
9618 | |
9619 | bind(post_alignment); |
9620 | // end of alignment |
9621 | |
9622 | movl(tmp5, len); |
9623 | andl(tmp5, (32 - 1)); // tail count (in chars) |
9624 | andl(len, ~(32 - 1)); // vector count (in chars) |
9625 | jcc(Assembler::zero, copy_loop_tail); |
9626 | |
9627 | lea(src, Address(src, len, Address::times_2)); |
9628 | lea(dst, Address(dst, len, Address::times_1)); |
9629 | negptr(len); |
9630 | |
9631 | bind(copy_32_loop); |
9632 | evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); |
9633 | evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); |
9634 | kortestdl(k2, k2); |
9635 | jcc(Assembler::carryClear, return_zero); |
9636 | |
    // All elements in the currently processed chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
9639 | evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); |
9640 | addptr(len, 32); |
9641 | jcc(Assembler::notZero, copy_32_loop); |
9642 | |
9643 | bind(copy_loop_tail); |
9644 | // bail out when there is nothing to be done |
9645 | testl(tmp5, 0xFFFFFFFF); |
9646 | jcc(Assembler::zero, return_length); |
9647 | |
9648 | movl(len, tmp5); |
9649 | |
9650 | // ~(~0 << len), where len is the # of remaining elements to process |
9651 | movl(result, 0xFFFFFFFF); |
9652 | shlxl(result, result, len); |
9653 | notl(result); |
9654 | |
9655 | kmovdl(k3, result); |
9656 | |
9657 | evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit); |
9658 | evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); |
9659 | ktestd(k2, k3); |
9660 | jcc(Assembler::carryClear, return_zero); |
9661 | |
9662 | evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit); |
9663 | jmp(return_length); |
9664 | |
9665 | bind(below_threshold); |
9666 | } |
9667 | |
9668 | if (UseSSE42Intrinsics) { |
9669 | Label copy_32_loop, copy_16, copy_tail; |
9670 | |
9671 | movl(result, len); |
9672 | |
9673 | movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors |
9674 | |
9675 | // vectored compression |
9676 | andl(len, 0xfffffff0); // vector count (in chars) |
9677 | andl(result, 0x0000000f); // tail count (in chars) |
9678 | testl(len, len); |
9679 | jcc(Assembler::zero, copy_16); |
9680 | |
9681 | // compress 16 chars per iter |
9682 | movdl(tmp1Reg, tmp5); |
9683 | pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg |
9684 | pxor(tmp4Reg, tmp4Reg); |
9685 | |
9686 | lea(src, Address(src, len, Address::times_2)); |
9687 | lea(dst, Address(dst, len, Address::times_1)); |
9688 | negptr(len); |
9689 | |
9690 | bind(copy_32_loop); |
9691 | movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters |
9692 | por(tmp4Reg, tmp2Reg); |
9693 | movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters |
9694 | por(tmp4Reg, tmp3Reg); |
9695 | ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector |
9696 | jcc(Assembler::notZero, return_zero); |
9697 | packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte |
9698 | movdqu(Address(dst, len, Address::times_1), tmp2Reg); |
9699 | addptr(len, 16); |
9700 | jcc(Assembler::notZero, copy_32_loop); |
9701 | |
9702 | // compress next vector of 8 chars (if any) |
9703 | bind(copy_16); |
9704 | movl(len, result); |
9705 | andl(len, 0xfffffff8); // vector count (in chars) |
9706 | andl(result, 0x00000007); // tail count (in chars) |
9707 | testl(len, len); |
9708 | jccb(Assembler::zero, copy_tail); |
9709 | |
9710 | movdl(tmp1Reg, tmp5); |
9711 | pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg |
9712 | pxor(tmp3Reg, tmp3Reg); |
9713 | |
9714 | movdqu(tmp2Reg, Address(src, 0)); |
9715 | ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector |
9716 | jccb(Assembler::notZero, return_zero); |
9717 | packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte |
9718 | movq(Address(dst, 0), tmp2Reg); |
9719 | addptr(src, 16); |
9720 | addptr(dst, 8); |
9721 | |
9722 | bind(copy_tail); |
9723 | movl(len, result); |
9724 | } |
9725 | // compress 1 char per iter |
9726 | testl(len, len); |
9727 | jccb(Assembler::zero, return_length); |
9728 | lea(src, Address(src, len, Address::times_2)); |
9729 | lea(dst, Address(dst, len, Address::times_1)); |
9730 | negptr(len); |
9731 | |
9732 | bind(copy_chars_loop); |
9733 | load_unsigned_short(result, Address(src, len, Address::times_2)); |
9734 | testl(result, 0xff00); // check if Unicode char |
9735 | jccb(Assembler::notZero, return_zero); |
9736 | movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte |
9737 | increment(len); |
9738 | jcc(Assembler::notZero, copy_chars_loop); |
9739 | |
9740 | // if compression succeeded, return length |
9741 | bind(return_length); |
9742 | pop(result); |
9743 | jmpb(done); |
9744 | |
9745 | // if compression failed, return 0 |
9746 | bind(return_zero); |
9747 | xorl(result, result); |
9748 | addptr(rsp, wordSize); |
9749 | |
9750 | bind(done); |
9751 | } |
9752 | |
9753 | // Inflate byte[] array to char[]. |
9754 | // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java |
9755 | // @HotSpotIntrinsicCandidate |
9756 | // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { |
9757 | // for (int i = 0; i < len; i++) { |
9758 | // dst[dstOff++] = (char)(src[srcOff++] & 0xff); |
9759 | // } |
9760 | // } |
9761 | void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, |
9762 | XMMRegister tmp1, Register tmp2) { |
9763 | Label copy_chars_loop, done, below_threshold; |
9764 | // rsi: src |
9765 | // rdi: dst |
9766 | // rdx: len |
9767 | // rcx: tmp2 |
9768 | |
9769 | // rsi holds start addr of source byte[] to be inflated |
9770 | // rdi holds start addr of destination char[] |
9771 | // rdx holds length |
9772 | assert_different_registers(src, dst, len, tmp2); |
9773 | |
9774 | if ((UseAVX > 2) && // AVX512 |
9775 | VM_Version::supports_avx512vlbw() && |
9776 | VM_Version::supports_bmi2()) { |
9777 | |
9778 | Label copy_32_loop, copy_tail; |
9779 | Register tmp3_aliased = len; |
9780 | |
    // if the length of the string is less than 16, handle it the old-fashioned way
9782 | testl(len, -16); |
9783 | jcc(Assembler::zero, below_threshold); |
9784 | |
9785 | // In order to use only one arithmetic operation for the main loop we use |
9786 | // this pre-calculation |
9787 | movl(tmp2, len); |
9788 | andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop |
9789 | andl(len, -32); // vector count |
9790 | jccb(Assembler::zero, copy_tail); |
9791 | |
9792 | lea(src, Address(src, len, Address::times_1)); |
9793 | lea(dst, Address(dst, len, Address::times_2)); |
9794 | negptr(len); |
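    // src/dst now point just past the vectorized region and len holds the
    // negative element count, so the loop below needs only the single
    // addptr(len, 32) to both advance the index and test for termination.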
9795 | |
9796 | |
9797 | // inflate 32 chars per iter |
9798 | bind(copy_32_loop); |
9799 | vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); |
9800 | evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); |
9801 | addptr(len, 32); |
9802 | jcc(Assembler::notZero, copy_32_loop); |
9803 | |
9804 | bind(copy_tail); |
9805 | // bail out when there is nothing to be done |
9806 | testl(tmp2, -1); // we don't destroy the contents of tmp2 here |
9807 | jcc(Assembler::zero, done); |
9808 | |
9809 | // ~(~0 << length), where length is the # of remaining elements to process |
9810 | movl(tmp3_aliased, -1); |
9811 | shlxl(tmp3_aliased, tmp3_aliased, tmp2); |
9812 | notl(tmp3_aliased); |
9813 | kmovdl(k2, tmp3_aliased); |
9814 | evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit); |
9815 | evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit); |
9816 | |
9817 | jmp(done); |
9818 | } |
9819 | if (UseSSE42Intrinsics) { |
9820 | Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; |
9821 | |
9822 | movl(tmp2, len); |
9823 | |
9824 | if (UseAVX > 1) { |
9825 | andl(tmp2, (16 - 1)); |
9826 | andl(len, -16); |
9827 | jccb(Assembler::zero, copy_new_tail); |
9828 | } else { |
9829 | andl(tmp2, 0x00000007); // tail count (in chars) |
9830 | andl(len, 0xfffffff8); // vector count (in chars) |
9831 | jccb(Assembler::zero, copy_tail); |
9832 | } |
9833 | |
9834 | // vectored inflation |
9835 | lea(src, Address(src, len, Address::times_1)); |
9836 | lea(dst, Address(dst, len, Address::times_2)); |
9837 | negptr(len); |
9838 | |
9839 | if (UseAVX > 1) { |
9840 | bind(copy_16_loop); |
9841 | vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); |
9842 | vmovdqu(Address(dst, len, Address::times_2), tmp1); |
9843 | addptr(len, 16); |
9844 | jcc(Assembler::notZero, copy_16_loop); |
9845 | |
9846 | bind(below_threshold); |
9847 | bind(copy_new_tail); |
9848 | if ((UseAVX > 2) && |
9849 | VM_Version::supports_avx512vlbw() && |
9850 | VM_Version::supports_bmi2()) { |
9851 | movl(tmp2, len); |
9852 | } else { |
9853 | movl(len, tmp2); |
9854 | } |
9855 | andl(tmp2, 0x00000007); |
9856 | andl(len, 0xFFFFFFF8); |
9857 | jccb(Assembler::zero, copy_tail); |
9858 | |
9859 | pmovzxbw(tmp1, Address(src, 0)); |
9860 | movdqu(Address(dst, 0), tmp1); |
9861 | addptr(src, 8); |
9862 | addptr(dst, 2 * 8); |
9863 | |
9864 | jmp(copy_tail, true); |
9865 | } |
9866 | |
9867 | // inflate 8 chars per iter |
9868 | bind(copy_8_loop); |
9869 | pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words |
9870 | movdqu(Address(dst, len, Address::times_2), tmp1); |
9871 | addptr(len, 8); |
9872 | jcc(Assembler::notZero, copy_8_loop); |
9873 | |
9874 | bind(copy_tail); |
9875 | movl(len, tmp2); |
9876 | |
9877 | cmpl(len, 4); |
9878 | jccb(Assembler::less, copy_bytes); |
9879 | |
9880 | movdl(tmp1, Address(src, 0)); // load 4 byte chars |
9881 | pmovzxbw(tmp1, tmp1); |
9882 | movq(Address(dst, 0), tmp1); |
9883 | subptr(len, 4); |
9884 | addptr(src, 4); |
9885 | addptr(dst, 8); |
9886 | |
9887 | bind(copy_bytes); |
9888 | } else { |
9889 | bind(below_threshold); |
9890 | } |
9891 | |
9892 | testl(len, len); |
9893 | jccb(Assembler::zero, done); |
9894 | lea(src, Address(src, len, Address::times_1)); |
9895 | lea(dst, Address(dst, len, Address::times_2)); |
9896 | negptr(len); |
9897 | |
9898 | // inflate 1 char per iter |
9899 | bind(copy_chars_loop); |
9900 | load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char |
9901 | movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word |
9902 | increment(len); |
9903 | jcc(Assembler::notZero, copy_chars_loop); |
9904 | |
9905 | bind(done); |
9906 | } |
9907 | |
9908 | Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { |
9909 | switch (cond) { |
9910 | // Note some conditions are synonyms for others |
9911 | case Assembler::zero: return Assembler::notZero; |
9912 | case Assembler::notZero: return Assembler::zero; |
9913 | case Assembler::less: return Assembler::greaterEqual; |
9914 | case Assembler::lessEqual: return Assembler::greater; |
9915 | case Assembler::greater: return Assembler::lessEqual; |
9916 | case Assembler::greaterEqual: return Assembler::less; |
9917 | case Assembler::below: return Assembler::aboveEqual; |
9918 | case Assembler::belowEqual: return Assembler::above; |
9919 | case Assembler::above: return Assembler::belowEqual; |
9920 | case Assembler::aboveEqual: return Assembler::below; |
9921 | case Assembler::overflow: return Assembler::noOverflow; |
9922 | case Assembler::noOverflow: return Assembler::overflow; |
9923 | case Assembler::negative: return Assembler::positive; |
9924 | case Assembler::positive: return Assembler::negative; |
9925 | case Assembler::parity: return Assembler::noParity; |
9926 | case Assembler::noParity: return Assembler::parity; |
9927 | } |
9928 | ShouldNotReachHere(); return Assembler::overflow; |
9929 | } |
9930 | |
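// SkipIfEqual: the constructor emits a compare of *flag_addr against value and
// a conditional jump past the code that follows; the destructor binds the jump
// target, so the guarded code is skipped when the flag equals value.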
9931 | SkipIfEqual::SkipIfEqual( |
9932 | MacroAssembler* masm, const bool* flag_addr, bool value) { |
9933 | _masm = masm; |
9934 | _masm->cmp8(ExternalAddress((address)flag_addr), value); |
9935 | _masm->jcc(Assembler::equal, _label); |
9936 | } |
9937 | |
9938 | SkipIfEqual::~SkipIfEqual() { |
9939 | _masm->bind(_label); |
9940 | } |
9941 | |
9942 | // 32-bit Windows has its own fast-path implementation |
9943 | // of get_thread |
9944 | #if !defined(WIN32) || defined(_LP64) |
9945 | |
9946 | // This is simply a call to Thread::current() |
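// Save the caller-saved registers around the call, since the C function
// Thread::current() is free to clobber them.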
9947 | void MacroAssembler::get_thread(Register thread) { |
9948 | if (thread != rax) { |
9949 | push(rax); |
9950 | } |
9951 | LP64_ONLY(push(rdi);) |
9952 | LP64_ONLY(push(rsi);) |
9953 | push(rdx); |
9954 | push(rcx); |
9955 | #ifdef _LP64 |
9956 | push(r8); |
9957 | push(r9); |
9958 | push(r10); |
9959 | push(r11); |
9960 | #endif |
9961 | |
9962 | MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0); |
9963 | |
9964 | #ifdef _LP64 |
9965 | pop(r11); |
9966 | pop(r10); |
9967 | pop(r9); |
9968 | pop(r8); |
9969 | #endif |
9970 | pop(rcx); |
9971 | pop(rdx); |
9972 | LP64_ONLY(pop(rsi);) |
9973 | LP64_ONLY(pop(rdi);) |
9974 | if (thread != rax) { |
9975 | mov(thread, rax); |
9976 | pop(rax); |
9977 | } |
9978 | } |
9979 | |
9980 | #endif |
9981 | |