1/*
2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "jvm.h"
27#include "asm/assembler.hpp"
28#include "asm/assembler.inline.hpp"
29#include "compiler/disassembler.hpp"
30#include "gc/shared/barrierSet.hpp"
31#include "gc/shared/barrierSetAssembler.hpp"
32#include "gc/shared/collectedHeap.inline.hpp"
33#include "interpreter/interpreter.hpp"
34#include "memory/resourceArea.hpp"
35#include "memory/universe.hpp"
36#include "oops/accessDecorators.hpp"
37#include "oops/compressedOops.inline.hpp"
38#include "oops/klass.inline.hpp"
39#include "prims/methodHandles.hpp"
40#include "runtime/biasedLocking.hpp"
41#include "runtime/flags/flagSetting.hpp"
42#include "runtime/interfaceSupport.inline.hpp"
43#include "runtime/objectMonitor.hpp"
44#include "runtime/os.hpp"
45#include "runtime/safepoint.hpp"
46#include "runtime/safepointMechanism.hpp"
47#include "runtime/sharedRuntime.hpp"
48#include "runtime/stubRoutines.hpp"
49#include "runtime/thread.hpp"
50#include "utilities/macros.hpp"
51#include "crc32c.h"
52#ifdef COMPILER2
53#include "opto/intrinsicnode.hpp"
54#endif
55
56#ifdef PRODUCT
57#define BLOCK_COMMENT(str) /* nothing */
58#define STOP(error) stop(error)
59#else
60#define BLOCK_COMMENT(str) block_comment(str)
61#define STOP(error) block_comment(error); stop(error)
62#endif
63
64#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
65
66#ifdef ASSERT
67bool AbstractAssembler::pd_check_instruction_mark() { return true; }
68#endif
69
70static Assembler::Condition reverse[] = {
71 Assembler::noOverflow /* overflow = 0x0 */ ,
72 Assembler::overflow /* noOverflow = 0x1 */ ,
73 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
74 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
75 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
76 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
77 Assembler::above /* belowEqual = 0x6 */ ,
78 Assembler::belowEqual /* above = 0x7 */ ,
79 Assembler::positive /* negative = 0x8 */ ,
80 Assembler::negative /* positive = 0x9 */ ,
81 Assembler::noParity /* parity = 0xa */ ,
82 Assembler::parity /* noParity = 0xb */ ,
83 Assembler::greaterEqual /* less = 0xc */ ,
84 Assembler::less /* greaterEqual = 0xd */ ,
85 Assembler::greater /* lessEqual = 0xe */ ,
86 Assembler::lessEqual /* greater = 0xf, */
87
88};
89
90
91// Implementation of MacroAssembler
92
// First, all the operations that have distinct 32- and 64-bit versions,
// unless the difference is trivial (1 line or so).
95
96#ifndef _LP64
97
98// 32bit versions
99
100Address MacroAssembler::as_Address(AddressLiteral adr) {
101 return Address(adr.target(), adr.rspec());
102}
103
104Address MacroAssembler::as_Address(ArrayAddress adr) {
105 return Address::make_array(adr);
106}
107
108void MacroAssembler::call_VM_leaf_base(address entry_point,
109 int number_of_arguments) {
110 call(RuntimeAddress(entry_point));
111 increment(rsp, number_of_arguments * wordSize);
112}
113
114void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
115 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
116}
117
118void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
119 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
120}
121
122void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
123 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
124}
125
126void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
127 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
128}
129
130void MacroAssembler::cmpoop(Address src1, jobject obj) {
131 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
132 bs->obj_equals(this, src1, obj);
133}
134
135void MacroAssembler::cmpoop(Register src1, jobject obj) {
136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
137 bs->obj_equals(this, src1, obj);
138}
139
140void MacroAssembler::extend_sign(Register hi, Register lo) {
141 // According to Intel Doc. AP-526, "Integer Divide", p.18.
142 if (VM_Version::is_P6() && hi == rdx && lo == rax) {
143 cdql();
144 } else {
145 movl(hi, lo);
146 sarl(hi, 31);
147 }
148}
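
// Illustrative note: both paths above compute hi:lo = (int64_t)(int32_t)lo.
// With the operands already in rdx:rax, CDQ does the sign extension in one
// instruction; otherwise the sign bit is replicated explicitly, e.g.
// lo = 0xfffffffe (-2) yields hi = 0xffffffff, while lo = 0x00000005 yields hi = 0.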
149
150void MacroAssembler::jC2(Register tmp, Label& L) {
151 // set parity bit if FPU flag C2 is set (via rax)
152 save_rax(tmp);
153 fwait(); fnstsw_ax();
154 sahf();
155 restore_rax(tmp);
156 // branch
157 jcc(Assembler::parity, L);
158}
159
160void MacroAssembler::jnC2(Register tmp, Label& L) {
161 // set parity bit if FPU flag C2 is set (via rax)
162 save_rax(tmp);
163 fwait(); fnstsw_ax();
164 sahf();
165 restore_rax(tmp);
166 // branch
167 jcc(Assembler::noParity, L);
168}
169
170// 32bit can do a case table jump in one instruction but we no longer allow the base
171// to be installed in the Address class
172void MacroAssembler::jump(ArrayAddress entry) {
173 jmp(as_Address(entry));
174}
175
176// Note: y_lo will be destroyed
177void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
178 // Long compare for Java (semantics as described in JVM spec.)
179 Label high, low, done;
180
181 cmpl(x_hi, y_hi);
182 jcc(Assembler::less, low);
183 jcc(Assembler::greater, high);
184 // x_hi is the return register
185 xorl(x_hi, x_hi);
186 cmpl(x_lo, y_lo);
187 jcc(Assembler::below, low);
188 jcc(Assembler::equal, done);
189
190 bind(high);
191 xorl(x_hi, x_hi);
192 increment(x_hi);
193 jmp(done);
194
195 bind(low);
196 xorl(x_hi, x_hi);
197 decrementl(x_hi);
198
199 bind(done);
200}
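
// Illustrative note: this realizes the Java lcmp result (-1, 0 or +1, left in
// x_hi) for a long split across two 32-bit registers. The high words are
// compared signed; only when they are equal do the low words decide, and they
// are compared unsigned (Assembler::below) because the low word carries no
// sign of its own, e.g. 0x00000000_80000000 must compare greater than
// 0x00000000_00000001.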
201
202void MacroAssembler::lea(Register dst, AddressLiteral src) {
203 mov_literal32(dst, (int32_t)src.target(), src.rspec());
204}
205
206void MacroAssembler::lea(Address dst, AddressLiteral adr) {
207 // leal(dst, as_Address(adr));
208 // see note in movl as to why we must use a move
209 mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
210}
211
212void MacroAssembler::leave() {
213 mov(rsp, rbp);
214 pop(rbp);
215}
216
217void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
218 // Multiplication of two Java long values stored on the stack
219 // as illustrated below. Result is in rdx:rax.
220 //
  //                     rsp ---> [  ??  ] \               \
  //                               ....    | y_rsp_offset  |
  //                              [ y_lo ] /  (in bytes)   | x_rsp_offset
  //                              [ y_hi ]                 | (in bytes)
  //                               ....                    |
  //                              [ x_lo ]                /
  //                              [ x_hi ]
  //                               ....
  //
230 // Basic idea: lo(result) = lo(x_lo * y_lo)
231 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
232 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
233 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
234 Label quick;
235 // load x_hi, y_hi and check if quick
236 // multiplication is possible
237 movl(rbx, x_hi);
238 movl(rcx, y_hi);
239 movl(rax, rbx);
240 orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0
241 jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply
242 // do full multiplication
243 // 1st step
244 mull(y_lo); // x_hi * y_lo
245 movl(rbx, rax); // save lo(x_hi * y_lo) in rbx,
246 // 2nd step
247 movl(rax, x_lo);
248 mull(rcx); // x_lo * y_hi
249 addl(rbx, rax); // add lo(x_lo * y_hi) to rbx,
250 // 3rd step
251 bind(quick); // note: rbx, = 0 if quick multiply!
252 movl(rax, x_lo);
253 mull(y_lo); // x_lo * y_lo
254 addl(rdx, rbx); // correct hi(x_lo * y_lo)
255}
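
// Worked derivation (illustrative): with x = x_hi*2^32 + x_lo and
// y = y_hi*2^32 + y_lo,
//   x*y = x_hi*y_hi*2^64 + (x_hi*y_lo + x_lo*y_hi)*2^32 + x_lo*y_lo
// Only the low 64 bits of the product are kept, so the 2^64 term drops out and
// each cross product contributes only its low 32 bits to the high half, which
// is exactly the three mull steps above. When both high words are zero the
// cross products vanish as well, which is the "quick" path.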
256
257void MacroAssembler::lneg(Register hi, Register lo) {
258 negl(lo);
259 adcl(hi, 0);
260 negl(hi);
261}
262
263void MacroAssembler::lshl(Register hi, Register lo) {
264 // Java shift left long support (semantics as described in JVM spec., p.305)
265 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
266 // shift value is in rcx !
267 assert(hi != rcx, "must not use rcx");
268 assert(lo != rcx, "must not use rcx");
269 const Register s = rcx; // shift count
270 const int n = BitsPerWord;
271 Label L;
272 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
273 cmpl(s, n); // if (s < n)
274 jcc(Assembler::less, L); // else (s >= n)
275 movl(hi, lo); // x := x << n
276 xorl(lo, lo);
277 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
278 bind(L); // s (mod n) < n
279 shldl(hi, lo); // x := x << s
280 shll(lo);
281}
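
// Worked example (illustrative): for a shift count of 40 the branch above is
// not taken, so hi := lo and lo := 0 (the "shift by 32" step); SHLD/SHL then
// shift by rcx, which the hardware takes mod 32, i.e. by 8, giving the full
// 40-bit shift. For counts below 32 the code jumps straight to SHLD/SHL.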
282
283
284void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
285 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
286 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
287 assert(hi != rcx, "must not use rcx");
288 assert(lo != rcx, "must not use rcx");
289 const Register s = rcx; // shift count
290 const int n = BitsPerWord;
291 Label L;
292 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
293 cmpl(s, n); // if (s < n)
294 jcc(Assembler::less, L); // else (s >= n)
295 movl(lo, hi); // x := x >> n
296 if (sign_extension) sarl(hi, 31);
297 else xorl(hi, hi);
298 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
299 bind(L); // s (mod n) < n
300 shrdl(lo, hi); // x := x >> s
301 if (sign_extension) sarl(hi);
302 else shrl(hi);
303}
304
305void MacroAssembler::movoop(Register dst, jobject obj) {
306 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
307}
308
309void MacroAssembler::movoop(Address dst, jobject obj) {
310 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
311}
312
313void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
314 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
315}
316
317void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
318 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
319}
320
321void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
322 // scratch register is not used,
323 // it is defined to match parameters of 64-bit version of this method.
324 if (src.is_lval()) {
325 mov_literal32(dst, (intptr_t)src.target(), src.rspec());
326 } else {
327 movl(dst, as_Address(src));
328 }
329}
330
331void MacroAssembler::movptr(ArrayAddress dst, Register src) {
332 movl(as_Address(dst), src);
333}
334
335void MacroAssembler::movptr(Register dst, ArrayAddress src) {
336 movl(dst, as_Address(src));
337}
338
339// src should NEVER be a real pointer. Use AddressLiteral for true pointers
340void MacroAssembler::movptr(Address dst, intptr_t src) {
341 movl(dst, src);
342}
343
344
345void MacroAssembler::pop_callee_saved_registers() {
346 pop(rcx);
347 pop(rdx);
348 pop(rdi);
349 pop(rsi);
350}
351
352void MacroAssembler::pop_fTOS() {
353 fld_d(Address(rsp, 0));
354 addl(rsp, 2 * wordSize);
355}
356
357void MacroAssembler::push_callee_saved_registers() {
358 push(rsi);
359 push(rdi);
360 push(rdx);
361 push(rcx);
362}
363
364void MacroAssembler::push_fTOS() {
365 subl(rsp, 2 * wordSize);
366 fstp_d(Address(rsp, 0));
367}
368
369
370void MacroAssembler::pushoop(jobject obj) {
371 push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
372}
373
374void MacroAssembler::pushklass(Metadata* obj) {
375 push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
376}
377
378void MacroAssembler::pushptr(AddressLiteral src) {
379 if (src.is_lval()) {
380 push_literal32((int32_t)src.target(), src.rspec());
381 } else {
382 pushl(as_Address(src));
383 }
384}
385
386void MacroAssembler::set_word_if_not_zero(Register dst) {
387 xorl(dst, dst);
388 set_byte_if_not_zero(dst);
389}
390
391static void pass_arg0(MacroAssembler* masm, Register arg) {
392 masm->push(arg);
393}
394
395static void pass_arg1(MacroAssembler* masm, Register arg) {
396 masm->push(arg);
397}
398
399static void pass_arg2(MacroAssembler* masm, Register arg) {
400 masm->push(arg);
401}
402
403static void pass_arg3(MacroAssembler* masm, Register arg) {
404 masm->push(arg);
405}
406
407#ifndef PRODUCT
408extern "C" void findpc(intptr_t x);
409#endif
410
411void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
413 JavaThread* thread = JavaThread::current();
414 JavaThreadState saved_state = thread->thread_state();
415 thread->set_thread_state(_thread_in_vm);
416 if (ShowMessageBoxOnError) {
417 JavaThread* thread = JavaThread::current();
418 JavaThreadState saved_state = thread->thread_state();
419 thread->set_thread_state(_thread_in_vm);
420 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
421 ttyLocker ttyl;
422 BytecodeCounter::print();
423 }
424 // To see where a verify_oop failed, get $ebx+40/X for this frame.
425 // This is the value of eip which points to where verify_oop will return.
426 if (os::message_box(msg, "Execution stopped, print registers?")) {
427 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
428 BREAKPOINT;
429 }
430 } else {
431 ttyLocker ttyl;
432 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
433 }
434 // Don't assert holding the ttyLock
435 assert(false, "DEBUG MESSAGE: %s", msg);
436 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
437}
438
439void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
440 ttyLocker ttyl;
441 FlagSetting fs(Debugging, true);
442 tty->print_cr("eip = 0x%08x", eip);
443#ifndef PRODUCT
444 if ((WizardMode || Verbose) && PrintMiscellaneous) {
445 tty->cr();
446 findpc(eip);
447 tty->cr();
448 }
449#endif
450#define PRINT_REG(rax) \
451 { tty->print("%s = ", #rax); os::print_location(tty, rax); }
452 PRINT_REG(rax);
453 PRINT_REG(rbx);
454 PRINT_REG(rcx);
455 PRINT_REG(rdx);
456 PRINT_REG(rdi);
457 PRINT_REG(rsi);
458 PRINT_REG(rbp);
459 PRINT_REG(rsp);
460#undef PRINT_REG
  // Print some words near top of stack.
462 int* dump_sp = (int*) rsp;
463 for (int col1 = 0; col1 < 8; col1++) {
464 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
465 os::print_location(tty, *dump_sp++);
466 }
467 for (int row = 0; row < 16; row++) {
468 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
469 for (int col = 0; col < 8; col++) {
470 tty->print(" 0x%08x", *dump_sp++);
471 }
472 tty->cr();
473 }
474 // Print some instructions around pc:
475 Disassembler::decode((address)eip-64, (address)eip);
476 tty->print_cr("--------");
477 Disassembler::decode((address)eip, (address)eip+32);
478}
479
480void MacroAssembler::stop(const char* msg) {
481 ExternalAddress message((address)msg);
482 // push address of message
483 pushptr(message.addr());
484 { Label L; call(L, relocInfo::none); bind(L); } // push eip
485 pusha(); // push registers
486 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
487 hlt();
488}
489
490void MacroAssembler::warn(const char* msg) {
491 push_CPU_state();
492
493 ExternalAddress message((address) msg);
494 // push address of message
495 pushptr(message.addr());
496
497 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
498 addl(rsp, wordSize); // discard argument
499 pop_CPU_state();
500}
501
502void MacroAssembler::print_state() {
503 { Label L; call(L, relocInfo::none); bind(L); } // push eip
504 pusha(); // push registers
505
506 push_CPU_state();
507 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
508 pop_CPU_state();
509
510 popa();
511 addl(rsp, wordSize);
512}
513
514#else // _LP64
515
516// 64 bit versions
517
518Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-relative address;
  // we can be absolute or disp based on the instruction type.
  // jmp/call are displacements, others are absolute.
522 assert(!adr.is_lval(), "must be rval");
523 assert(reachable(adr), "must be");
524 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
525
526}
527
528Address MacroAssembler::as_Address(ArrayAddress adr) {
529 AddressLiteral base = adr.base();
530 lea(rscratch1, base);
531 Address index = adr.index();
532 assert(index._disp == 0, "must not have disp"); // maybe it can?
533 Address array(rscratch1, index._index, index._scale, index._disp);
534 return array;
535}
536
537void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
538 Label L, E;
539
540#ifdef _WIN64
  // Windows always allocates space for its register args
542 assert(num_args <= 4, "only register arguments supported");
543 subq(rsp, frame::arg_reg_save_area_bytes);
544#endif
545
546 // Align stack if necessary
547 testl(rsp, 15);
548 jcc(Assembler::zero, L);
549
550 subq(rsp, 8);
551 {
552 call(RuntimeAddress(entry_point));
553 }
554 addq(rsp, 8);
555 jmp(E);
556
557 bind(L);
558 {
559 call(RuntimeAddress(entry_point));
560 }
561
562 bind(E);
563
564#ifdef _WIN64
565 // restore stack pointer
566 addq(rsp, frame::arg_reg_save_area_bytes);
567#endif
568
569}
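
// Note (illustrative): the point of the testl/subq dance above is to keep rsp
// 16-byte aligned around the call, as the x86-64 calling convention expects.
// Assuming rsp is always at least 8-byte aligned here, a nonzero
// testl(rsp, 15) means it is off by exactly 8, so one extra subq(rsp, 8)
// before the call (undone afterwards) restores the alignment.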
570
571void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
572 assert(!src2.is_lval(), "should use cmpptr");
573
574 if (reachable(src2)) {
575 cmpq(src1, as_Address(src2));
576 } else {
577 lea(rscratch1, src2);
578 Assembler::cmpq(src1, Address(rscratch1, 0));
579 }
580}
581
582int MacroAssembler::corrected_idivq(Register reg) {
583 // Full implementation of Java ldiv and lrem; checks for special
584 // case as described in JVM spec., p.243 & p.271. The function
  // returns the (pc) offset of the idivq instruction - may be needed
586 // for implicit exceptions.
587 //
588 // normal case special case
589 //
590 // input : rax: dividend min_long
591 // reg: divisor (may not be eax/edx) -1
592 //
593 // output: rax: quotient (= rax idiv reg) min_long
594 // rdx: remainder (= rax irem reg) 0
595 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
596 static const int64_t min_long = 0x8000000000000000;
597 Label normal_case, special_case;
598
599 // check for special case
600 cmp64(rax, ExternalAddress((address) &min_long));
601 jcc(Assembler::notEqual, normal_case);
602 xorl(rdx, rdx); // prepare rdx for possible special case (where
603 // remainder = 0)
604 cmpq(reg, -1);
605 jcc(Assembler::equal, special_case);
606
607 // handle normal case
608 bind(normal_case);
609 cdqq();
610 int idivq_offset = offset();
611 idivq(reg);
612
613 // normal and special case exit
614 bind(special_case);
615
616 return idivq_offset;
617}
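
// Note (illustrative): the special case exists because min_long / -1 would be
// 2^63, which is not representable as a signed 64-bit value; the hardware IDIV
// raises #DE on that overflow instead of producing a result. The JVM spec
// defines the answer as quotient = min_long, remainder = 0, which is exactly
// what rax/rdx already contain when the branch to special_case is taken.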
618
619void MacroAssembler::decrementq(Register reg, int value) {
620 if (value == min_jint) { subq(reg, value); return; }
621 if (value < 0) { incrementq(reg, -value); return; }
622 if (value == 0) { ; return; }
623 if (value == 1 && UseIncDec) { decq(reg) ; return; }
624 /* else */ { subq(reg, value) ; return; }
625}
626
627void MacroAssembler::decrementq(Address dst, int value) {
628 if (value == min_jint) { subq(dst, value); return; }
629 if (value < 0) { incrementq(dst, -value); return; }
630 if (value == 0) { ; return; }
631 if (value == 1 && UseIncDec) { decq(dst) ; return; }
632 /* else */ { subq(dst, value) ; return; }
633}
634
635void MacroAssembler::incrementq(AddressLiteral dst) {
636 if (reachable(dst)) {
637 incrementq(as_Address(dst));
638 } else {
639 lea(rscratch1, dst);
640 incrementq(Address(rscratch1, 0));
641 }
642}
643
644void MacroAssembler::incrementq(Register reg, int value) {
645 if (value == min_jint) { addq(reg, value); return; }
646 if (value < 0) { decrementq(reg, -value); return; }
647 if (value == 0) { ; return; }
648 if (value == 1 && UseIncDec) { incq(reg) ; return; }
649 /* else */ { addq(reg, value) ; return; }
650}
651
652void MacroAssembler::incrementq(Address dst, int value) {
653 if (value == min_jint) { addq(dst, value); return; }
654 if (value < 0) { decrementq(dst, -value); return; }
655 if (value == 0) { ; return; }
656 if (value == 1 && UseIncDec) { incq(dst) ; return; }
657 /* else */ { addq(dst, value) ; return; }
658}
659
660// 32bit can do a case table jump in one instruction but we no longer allow the base
661// to be installed in the Address class
662void MacroAssembler::jump(ArrayAddress entry) {
663 lea(rscratch1, entry.base());
664 Address dispatch = entry.index();
665 assert(dispatch._base == noreg, "must be");
666 dispatch._base = rscratch1;
667 jmp(dispatch);
668}
669
670void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
671 ShouldNotReachHere(); // 64bit doesn't use two regs
672 cmpq(x_lo, y_lo);
673}
674
675void MacroAssembler::lea(Register dst, AddressLiteral src) {
676 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
677}
678
679void MacroAssembler::lea(Address dst, AddressLiteral adr) {
680 mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
681 movptr(dst, rscratch1);
682}
683
684void MacroAssembler::leave() {
685 // %%% is this really better? Why not on 32bit too?
686 emit_int8((unsigned char)0xC9); // LEAVE
687}
688
689void MacroAssembler::lneg(Register hi, Register lo) {
690 ShouldNotReachHere(); // 64bit doesn't use two regs
691 negq(lo);
692}
693
694void MacroAssembler::movoop(Register dst, jobject obj) {
695 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
696}
697
698void MacroAssembler::movoop(Address dst, jobject obj) {
699 mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
700 movq(dst, rscratch1);
701}
702
703void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
704 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
705}
706
707void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
708 mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
709 movq(dst, rscratch1);
710}
711
712void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
713 if (src.is_lval()) {
714 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
715 } else {
716 if (reachable(src)) {
717 movq(dst, as_Address(src));
718 } else {
719 lea(scratch, src);
720 movq(dst, Address(scratch, 0));
721 }
722 }
723}
724
725void MacroAssembler::movptr(ArrayAddress dst, Register src) {
726 movq(as_Address(dst), src);
727}
728
729void MacroAssembler::movptr(Register dst, ArrayAddress src) {
730 movq(dst, as_Address(src));
731}
732
733// src should NEVER be a real pointer. Use AddressLiteral for true pointers
734void MacroAssembler::movptr(Address dst, intptr_t src) {
735 mov64(rscratch1, src);
736 movq(dst, rscratch1);
737}
738
739// These are mostly for initializing NULL
740void MacroAssembler::movptr(Address dst, int32_t src) {
741 movslq(dst, src);
742}
743
744void MacroAssembler::movptr(Register dst, int32_t src) {
745 mov64(dst, (intptr_t)src);
746}
747
748void MacroAssembler::pushoop(jobject obj) {
749 movoop(rscratch1, obj);
750 push(rscratch1);
751}
752
753void MacroAssembler::pushklass(Metadata* obj) {
754 mov_metadata(rscratch1, obj);
755 push(rscratch1);
756}
757
758void MacroAssembler::pushptr(AddressLiteral src) {
759 lea(rscratch1, src);
760 if (src.is_lval()) {
761 push(rscratch1);
762 } else {
763 pushq(Address(rscratch1, 0));
764 }
765}
766
767void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
768 // we must set sp to zero to clear frame
769 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
770 // must clear fp, so that compiled frames are not confused; it is
771 // possible that we need it only for debugging
772 if (clear_fp) {
773 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
774 }
775
776 // Always clear the pc because it could have been set by make_walkable()
777 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
778 vzeroupper();
779}
780
781void MacroAssembler::set_last_Java_frame(Register last_java_sp,
782 Register last_java_fp,
783 address last_java_pc) {
784 vzeroupper();
785 // determine last_java_sp register
786 if (!last_java_sp->is_valid()) {
787 last_java_sp = rsp;
788 }
789
790 // last_java_fp is optional
791 if (last_java_fp->is_valid()) {
792 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
793 last_java_fp);
794 }
795
796 // last_java_pc is optional
797 if (last_java_pc != NULL) {
798 Address java_pc(r15_thread,
799 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
800 lea(rscratch1, InternalAddress(last_java_pc));
801 movptr(java_pc, rscratch1);
802 }
803
804 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
805}
806
807static void pass_arg0(MacroAssembler* masm, Register arg) {
808 if (c_rarg0 != arg ) {
809 masm->mov(c_rarg0, arg);
810 }
811}
812
813static void pass_arg1(MacroAssembler* masm, Register arg) {
814 if (c_rarg1 != arg ) {
815 masm->mov(c_rarg1, arg);
816 }
817}
818
819static void pass_arg2(MacroAssembler* masm, Register arg) {
820 if (c_rarg2 != arg ) {
821 masm->mov(c_rarg2, arg);
822 }
823}
824
825static void pass_arg3(MacroAssembler* masm, Register arg) {
826 if (c_rarg3 != arg ) {
827 masm->mov(c_rarg3, arg);
828 }
829}
830
831void MacroAssembler::stop(const char* msg) {
832 address rip = pc();
833 pusha(); // get regs on stack
834 lea(c_rarg0, ExternalAddress((address) msg));
835 lea(c_rarg1, InternalAddress(rip));
836 movq(c_rarg2, rsp); // pass pointer to regs array
837 andq(rsp, -16); // align stack as required by ABI
838 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
839 hlt();
840}
841
842void MacroAssembler::warn(const char* msg) {
843 push(rbp);
844 movq(rbp, rsp);
845 andq(rsp, -16); // align stack as required by push_CPU_state and call
846 push_CPU_state(); // keeps alignment at 16 bytes
847 lea(c_rarg0, ExternalAddress((address) msg));
848 lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
849 call(rax);
850 pop_CPU_state();
851 mov(rsp, rbp);
852 pop(rbp);
853}
854
855void MacroAssembler::print_state() {
856 address rip = pc();
857 pusha(); // get regs on stack
858 push(rbp);
859 movq(rbp, rsp);
860 andq(rsp, -16); // align stack as required by push_CPU_state and call
861 push_CPU_state(); // keeps alignment at 16 bytes
862
863 lea(c_rarg0, InternalAddress(rip));
864 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
865 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
866
867 pop_CPU_state();
868 mov(rsp, rbp);
869 pop(rbp);
870 popa();
871}
872
873#ifndef PRODUCT
874extern "C" void findpc(intptr_t x);
875#endif
876
877void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
879 if (ShowMessageBoxOnError) {
880 JavaThread* thread = JavaThread::current();
881 JavaThreadState saved_state = thread->thread_state();
882 thread->set_thread_state(_thread_in_vm);
883#ifndef PRODUCT
884 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
885 ttyLocker ttyl;
886 BytecodeCounter::print();
887 }
888#endif
889 // To see where a verify_oop failed, get $ebx+40/X for this frame.
890 // XXX correct this offset for amd64
891 // This is the value of eip which points to where verify_oop will return.
892 if (os::message_box(msg, "Execution stopped, print registers?")) {
893 print_state64(pc, regs);
894 BREAKPOINT;
895 assert(false, "start up GDB");
896 }
897 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
898 } else {
899 ttyLocker ttyl;
900 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
901 msg);
902 assert(false, "DEBUG MESSAGE: %s", msg);
903 }
904}
905
906void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
907 ttyLocker ttyl;
908 FlagSetting fs(Debugging, true);
909 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
910#ifndef PRODUCT
911 tty->cr();
912 findpc(pc);
913 tty->cr();
914#endif
915#define PRINT_REG(rax, value) \
916 { tty->print("%s = ", #rax); os::print_location(tty, value); }
917 PRINT_REG(rax, regs[15]);
918 PRINT_REG(rbx, regs[12]);
919 PRINT_REG(rcx, regs[14]);
920 PRINT_REG(rdx, regs[13]);
921 PRINT_REG(rdi, regs[8]);
922 PRINT_REG(rsi, regs[9]);
923 PRINT_REG(rbp, regs[10]);
924 PRINT_REG(rsp, regs[11]);
925 PRINT_REG(r8 , regs[7]);
926 PRINT_REG(r9 , regs[6]);
927 PRINT_REG(r10, regs[5]);
928 PRINT_REG(r11, regs[4]);
929 PRINT_REG(r12, regs[3]);
930 PRINT_REG(r13, regs[2]);
931 PRINT_REG(r14, regs[1]);
932 PRINT_REG(r15, regs[0]);
933#undef PRINT_REG
  // Print some words near top of stack.
935 int64_t* rsp = (int64_t*) regs[11];
936 int64_t* dump_sp = rsp;
937 for (int col1 = 0; col1 < 8; col1++) {
938 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
939 os::print_location(tty, *dump_sp++);
940 }
941 for (int row = 0; row < 25; row++) {
942 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
943 for (int col = 0; col < 4; col++) {
944 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
945 }
946 tty->cr();
947 }
948 // Print some instructions around pc:
949 Disassembler::decode((address)pc-64, (address)pc);
950 tty->print_cr("--------");
951 Disassembler::decode((address)pc, (address)pc+32);
952}
953
954#endif // _LP64
955
956// Now versions that are common to 32/64 bit
957
958void MacroAssembler::addptr(Register dst, int32_t imm32) {
959 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
960}
961
962void MacroAssembler::addptr(Register dst, Register src) {
963 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
964}
965
966void MacroAssembler::addptr(Address dst, Register src) {
967 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
968}
969
970void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
971 if (reachable(src)) {
972 Assembler::addsd(dst, as_Address(src));
973 } else {
974 lea(rscratch1, src);
975 Assembler::addsd(dst, Address(rscratch1, 0));
976 }
977}
978
979void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
980 if (reachable(src)) {
981 addss(dst, as_Address(src));
982 } else {
983 lea(rscratch1, src);
984 addss(dst, Address(rscratch1, 0));
985 }
986}
987
988void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
989 if (reachable(src)) {
990 Assembler::addpd(dst, as_Address(src));
991 } else {
992 lea(rscratch1, src);
993 Assembler::addpd(dst, Address(rscratch1, 0));
994 }
995}
996
997void MacroAssembler::align(int modulus) {
998 align(modulus, offset());
999}
1000
1001void MacroAssembler::align(int modulus, int target) {
1002 if (target % modulus != 0) {
1003 nop(modulus - (target % modulus));
1004 }
1005}
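
// Example (illustrative): with modulus = 16 and the current code offset at 13,
// align() emits nop(3) so the next instruction starts on a 16-byte boundary;
// if the offset is already a multiple of the modulus nothing is emitted.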
1006
1007void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1008 // Used in sign-masking with aligned address.
1009 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1010 if (reachable(src)) {
1011 Assembler::andpd(dst, as_Address(src));
1012 } else {
1013 lea(scratch_reg, src);
1014 Assembler::andpd(dst, Address(scratch_reg, 0));
1015 }
1016}
1017
1018void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1019 // Used in sign-masking with aligned address.
1020 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1021 if (reachable(src)) {
1022 Assembler::andps(dst, as_Address(src));
1023 } else {
1024 lea(scratch_reg, src);
1025 Assembler::andps(dst, Address(scratch_reg, 0));
1026 }
1027}
1028
1029void MacroAssembler::andptr(Register dst, int32_t imm32) {
1030 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1031}
1032
1033void MacroAssembler::atomic_incl(Address counter_addr) {
1034 lock();
1035 incrementl(counter_addr);
1036}
1037
1038void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1039 if (reachable(counter_addr)) {
1040 atomic_incl(as_Address(counter_addr));
1041 } else {
1042 lea(scr, counter_addr);
1043 atomic_incl(Address(scr, 0));
1044 }
1045}
1046
1047#ifdef _LP64
1048void MacroAssembler::atomic_incq(Address counter_addr) {
1049 lock();
1050 incrementq(counter_addr);
1051}
1052
1053void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1054 if (reachable(counter_addr)) {
1055 atomic_incq(as_Address(counter_addr));
1056 } else {
1057 lea(scr, counter_addr);
1058 atomic_incq(Address(scr, 0));
1059 }
1060}
1061#endif
1062
1063// Writes to stack successive pages until offset reached to check for
1064// stack overflow + shadow pages. This clobbers tmp.
1065void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1066 movptr(tmp, rsp);
1067 // Bang stack for total size given plus shadow page size.
1068 // Bang one page at a time because large size can bang beyond yellow and
1069 // red zones.
1070 Label loop;
1071 bind(loop);
1072 movl(Address(tmp, (-os::vm_page_size())), size );
1073 subptr(tmp, os::vm_page_size());
1074 subl(size, os::vm_page_size());
1075 jcc(Assembler::greater, loop);
1076
1077 // Bang down shadow pages too.
1078 // At this point, (tmp-0) is the last address touched, so don't
1079 // touch it again. (It was touched as (tmp-pagesize) but then tmp
1080 // was post-decremented.) Skip this address by starting at i=1, and
1081 // touch a few more pages below. N.B. It is important to touch all
1082 // the way down including all pages in the shadow zone.
1083 for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but since it can serve as a debugging crumb
    // the bigger the better.
1086 movptr(Address(tmp, (-i*os::vm_page_size())), size );
1087 }
1088}
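
// Note (illustrative): each store in the loop and in the unrolled shadow-page
// walk touches exactly one page, so the guard pages are hit in order; a single
// store far below rsp could skip past the yellow/red zones and fault in
// unmapped memory instead of triggering the expected stack-overflow handling.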
1089
1090void MacroAssembler::reserved_stack_check() {
1091 // testing if reserved zone needs to be enabled
1092 Label no_reserved_zone_enabling;
1093 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1094 NOT_LP64(get_thread(rsi);)
1095
1096 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1097 jcc(Assembler::below, no_reserved_zone_enabling);
1098
1099 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1100 jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1101 should_not_reach_here();
1102
1103 bind(no_reserved_zone_enabling);
1104}
1105
1106int MacroAssembler::biased_locking_enter(Register lock_reg,
1107 Register obj_reg,
1108 Register swap_reg,
1109 Register tmp_reg,
1110 bool swap_reg_contains_mark,
1111 Label& done,
1112 Label* slow_case,
1113 BiasedLockingCounters* counters) {
1114 assert(UseBiasedLocking, "why call this otherwise?");
1115 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1116 assert(tmp_reg != noreg, "tmp_reg must be supplied");
1117 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1118 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1119 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
1120 NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1121
1122 if (PrintBiasedLockingStatistics && counters == NULL) {
1123 counters = BiasedLocking::counters();
1124 }
1125 // Biased locking
1126 // See whether the lock is currently biased toward our thread and
1127 // whether the epoch is still valid
1128 // Note that the runtime guarantees sufficient alignment of JavaThread
1129 // pointers to allow age to be placed into low bits
1130 // First check to see whether biasing is even enabled for this object
1131 Label cas_label;
1132 int null_check_offset = -1;
1133 if (!swap_reg_contains_mark) {
1134 null_check_offset = offset();
1135 movptr(swap_reg, mark_addr);
1136 }
1137 movptr(tmp_reg, swap_reg);
1138 andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1139 cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1140 jcc(Assembler::notEqual, cas_label);
1141 // The bias pattern is present in the object's header. Need to check
1142 // whether the bias owner and the epoch are both still current.
1143#ifndef _LP64
1144 // Note that because there is no current thread register on x86_32 we
1145 // need to store off the mark word we read out of the object to
1146 // avoid reloading it and needing to recheck invariants below. This
1147 // store is unfortunate but it makes the overall code shorter and
1148 // simpler.
1149 movptr(saved_mark_addr, swap_reg);
1150#endif
1151 if (swap_reg_contains_mark) {
1152 null_check_offset = offset();
1153 }
1154 load_prototype_header(tmp_reg, obj_reg);
1155#ifdef _LP64
1156 orptr(tmp_reg, r15_thread);
1157 xorptr(tmp_reg, swap_reg);
1158 Register header_reg = tmp_reg;
1159#else
1160 xorptr(tmp_reg, swap_reg);
1161 get_thread(swap_reg);
1162 xorptr(swap_reg, tmp_reg);
1163 Register header_reg = swap_reg;
1164#endif
1165 andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1166 if (counters != NULL) {
1167 cond_inc32(Assembler::zero,
1168 ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1169 }
1170 jcc(Assembler::equal, done);
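  // Note (illustrative): header_reg is zero here exactly when the mark word
  // already encodes a bias toward the current thread, with the current epoch
  // and the klass' prototype bits, once the age bits are masked off; in that
  // case the jcc above takes the fast path to done without any atomic update.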
1171
1172 Label try_revoke_bias;
1173 Label try_rebias;
1174
1175 // At this point we know that the header has the bias pattern and
1176 // that we are not the bias owner in the current epoch. We need to
1177 // figure out more details about the state of the header in order to
1178 // know what operations can be legally performed on the object's
1179 // header.
1180
1181 // If the low three bits in the xor result aren't clear, that means
1182 // the prototype header is no longer biased and we have to revoke
1183 // the bias on this object.
1184 testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1185 jccb(Assembler::notZero, try_revoke_bias);
1186
1187 // Biasing is still enabled for this data type. See whether the
1188 // epoch of the current bias is still valid, meaning that the epoch
1189 // bits of the mark word are equal to the epoch bits of the
1190 // prototype header. (Note that the prototype header's epoch bits
1191 // only change at a safepoint.) If not, attempt to rebias the object
1192 // toward the current thread. Note that we must be absolutely sure
1193 // that the current epoch is invalid in order to do this because
1194 // otherwise the manipulations it performs on the mark word are
1195 // illegal.
1196 testptr(header_reg, markOopDesc::epoch_mask_in_place);
1197 jccb(Assembler::notZero, try_rebias);
1198
1199 // The epoch of the current bias is still valid but we know nothing
1200 // about the owner; it might be set or it might be clear. Try to
1201 // acquire the bias of the object using an atomic operation. If this
1202 // fails we will go in to the runtime to revoke the object's bias.
1203 // Note that we first construct the presumed unbiased header so we
1204 // don't accidentally blow away another thread's valid bias.
1205 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1206 andptr(swap_reg,
1207 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1208#ifdef _LP64
1209 movptr(tmp_reg, swap_reg);
1210 orptr(tmp_reg, r15_thread);
1211#else
1212 get_thread(tmp_reg);
1213 orptr(tmp_reg, swap_reg);
1214#endif
1215 lock();
1216 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1217 // If the biasing toward our thread failed, this means that
1218 // another thread succeeded in biasing it toward itself and we
1219 // need to revoke that bias. The revocation will occur in the
1220 // interpreter runtime in the slow case.
1221 if (counters != NULL) {
1222 cond_inc32(Assembler::zero,
1223 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1224 }
1225 if (slow_case != NULL) {
1226 jcc(Assembler::notZero, *slow_case);
1227 }
1228 jmp(done);
1229
1230 bind(try_rebias);
1231 // At this point we know the epoch has expired, meaning that the
1232 // current "bias owner", if any, is actually invalid. Under these
1233 // circumstances _only_, we are allowed to use the current header's
1234 // value as the comparison value when doing the cas to acquire the
1235 // bias in the current epoch. In other words, we allow transfer of
1236 // the bias from one thread to another directly in this situation.
1237 //
1238 // FIXME: due to a lack of registers we currently blow away the age
1239 // bits in this situation. Should attempt to preserve them.
1240 load_prototype_header(tmp_reg, obj_reg);
1241#ifdef _LP64
1242 orptr(tmp_reg, r15_thread);
1243#else
1244 get_thread(swap_reg);
1245 orptr(tmp_reg, swap_reg);
1246 movptr(swap_reg, saved_mark_addr);
1247#endif
1248 lock();
1249 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1250 // If the biasing toward our thread failed, then another thread
1251 // succeeded in biasing it toward itself and we need to revoke that
1252 // bias. The revocation will occur in the runtime in the slow case.
1253 if (counters != NULL) {
1254 cond_inc32(Assembler::zero,
1255 ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1256 }
1257 if (slow_case != NULL) {
1258 jcc(Assembler::notZero, *slow_case);
1259 }
1260 jmp(done);
1261
1262 bind(try_revoke_bias);
1263 // The prototype mark in the klass doesn't have the bias bit set any
1264 // more, indicating that objects of this data type are not supposed
1265 // to be biased any more. We are going to try to reset the mark of
1266 // this object to the prototype value and fall through to the
1267 // CAS-based locking scheme. Note that if our CAS fails, it means
1268 // that another thread raced us for the privilege of revoking the
1269 // bias of this particular object, so it's okay to continue in the
1270 // normal locking code.
1271 //
1272 // FIXME: due to a lack of registers we currently blow away the age
1273 // bits in this situation. Should attempt to preserve them.
1274 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1275 load_prototype_header(tmp_reg, obj_reg);
1276 lock();
1277 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1278 // Fall through to the normal CAS-based lock, because no matter what
1279 // the result of the above CAS, some thread must have succeeded in
1280 // removing the bias bit from the object's header.
1281 if (counters != NULL) {
1282 cond_inc32(Assembler::zero,
1283 ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1284 }
1285
1286 bind(cas_label);
1287
1288 return null_check_offset;
1289}
1290
1291void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1292 assert(UseBiasedLocking, "why call this otherwise?");
1293
1294 // Check for biased locking unlock case, which is a no-op
1295 // Note: we do not have to check the thread ID for two reasons.
1296 // First, the interpreter checks for IllegalMonitorStateException at
1297 // a higher level. Second, if the bias was revoked while we held the
1298 // lock, the object could not be rebiased toward another thread, so
1299 // the bias bit would be clear.
1300 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1301 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1302 cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1303 jcc(Assembler::equal, done);
1304}
1305
1306#ifdef COMPILER2
1307
1308#if INCLUDE_RTM_OPT
1309
1310// Update rtm_counters based on abort status
1311// input: abort_status
1312// rtm_counters (RTMLockingCounters*)
1313// flags are killed
1314void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1315
1316 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1317 if (PrintPreciseRTMLockingStatistics) {
1318 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1319 Label check_abort;
1320 testl(abort_status, (1<<i));
1321 jccb(Assembler::equal, check_abort);
1322 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1323 bind(check_abort);
1324 }
1325 }
1326}
1327
1328// Branch if (random & (count-1) != 0), count is 2^n
1329// tmp, scr and flags are killed
1330void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1331 assert(tmp == rax, "");
1332 assert(scr == rdx, "");
1333 rdtsc(); // modifies EDX:EAX
1334 andptr(tmp, count-1);
1335 jccb(Assembler::notZero, brLabel);
1336}
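
// Note (illustrative): the low bits of the time-stamp counter act as a cheap
// pseudo-random value here. Since count is a power of two, (tsc & (count-1))
// is nonzero for all but one of the count possible low-bit patterns, so the
// branch is taken with probability (count-1)/count and the fall-through
// (the counter-increment path at the call sites) runs roughly once per count
// calls.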
1337
1338// Perform abort ratio calculation, set no_rtm bit if high ratio
1339// input: rtm_counters_Reg (RTMLockingCounters* address)
1340// tmpReg, rtm_counters_Reg and flags are killed
1341void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1342 Register rtm_counters_Reg,
1343 RTMLockingCounters* rtm_counters,
1344 Metadata* method_data) {
1345 Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1346
1347 if (RTMLockingCalculationDelay > 0) {
1348 // Delay calculation
1349 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1350 testptr(tmpReg, tmpReg);
1351 jccb(Assembler::equal, L_done);
1352 }
1353 // Abort ratio calculation only if abort_count > RTMAbortThreshold
1354 // Aborted transactions = abort_count * 100
1355 // All transactions = total_count * RTMTotalCountIncrRate
1356 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1357
1358 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1359 cmpptr(tmpReg, RTMAbortThreshold);
1360 jccb(Assembler::below, L_check_always_rtm2);
1361 imulptr(tmpReg, tmpReg, 100);
1362
1363 Register scrReg = rtm_counters_Reg;
1364 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1365 imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1366 imulptr(scrReg, scrReg, RTMAbortRatio);
1367 cmpptr(tmpReg, scrReg);
1368 jccb(Assembler::below, L_check_always_rtm1);
1369 if (method_data != NULL) {
1370 // set rtm_state to "no rtm" in MDO
1371 mov_metadata(tmpReg, method_data);
1372 lock();
1373 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1374 }
1375 jmpb(L_done);
1376 bind(L_check_always_rtm1);
1377 // Reload RTMLockingCounters* address
1378 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1379 bind(L_check_always_rtm2);
1380 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1381 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1382 jccb(Assembler::below, L_done);
1383 if (method_data != NULL) {
1384 // set rtm_state to "always rtm" in MDO
1385 mov_metadata(tmpReg, method_data);
1386 lock();
1387 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1388 }
1389 bind(L_done);
1390}
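
// Worked example (illustrative; the flag values below are made up, not
// defaults): with abort_count = 600, total_count = 20000,
// RTMTotalCountIncrRate = 64 and RTMAbortRatio = 50, the code compares
// 600 * 100 = 60000 against 20000 * 64 * 50 = 64000000; the aborted side is
// smaller, so the no_rtm bit is left clear and control proceeds to the
// "always rtm" check against RTMLockingThreshold / RTMTotalCountIncrRate.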
1391
1392// Update counters and perform abort ratio calculation
1393// input: abort_status_Reg
1394// rtm_counters_Reg, flags are killed
1395void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1396 Register rtm_counters_Reg,
1397 RTMLockingCounters* rtm_counters,
1398 Metadata* method_data,
1399 bool profile_rtm) {
1400
1401 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1402 // update rtm counters based on rax value at abort
1403 // reads abort_status_Reg, updates flags
1404 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1405 rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1406 if (profile_rtm) {
1407 // Save abort status because abort_status_Reg is used by following code.
1408 if (RTMRetryCount > 0) {
1409 push(abort_status_Reg);
1410 }
1411 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1412 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1413 // restore abort status
1414 if (RTMRetryCount > 0) {
1415 pop(abort_status_Reg);
1416 }
1417 }
1418}
1419
1420// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1421// inputs: retry_count_Reg
1422// : abort_status_Reg
1423// output: retry_count_Reg decremented by 1
1424// flags are killed
1425void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1426 Label doneRetry;
1427 assert(abort_status_Reg == rax, "");
1428 // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1429 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1430 // if reason is in 0x6 and retry count != 0 then retry
1431 andptr(abort_status_Reg, 0x6);
1432 jccb(Assembler::zero, doneRetry);
1433 testl(retry_count_Reg, retry_count_Reg);
1434 jccb(Assembler::zero, doneRetry);
1435 pause();
1436 decrementl(retry_count_Reg);
1437 jmp(retryLabel);
1438 bind(doneRetry);
1439}
1440
1441// Spin and retry if lock is busy,
1442// inputs: box_Reg (monitor address)
1443// : retry_count_Reg
1444// output: retry_count_Reg decremented by 1
1445// : clear z flag if retry count exceeded
1446// tmp_Reg, scr_Reg, flags are killed
1447void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1448 Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1449 Label SpinLoop, SpinExit, doneRetry;
1450 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1451
1452 testl(retry_count_Reg, retry_count_Reg);
1453 jccb(Assembler::zero, doneRetry);
1454 decrementl(retry_count_Reg);
1455 movptr(scr_Reg, RTMSpinLoopCount);
1456
1457 bind(SpinLoop);
1458 pause();
1459 decrementl(scr_Reg);
1460 jccb(Assembler::lessEqual, SpinExit);
1461 movptr(tmp_Reg, Address(box_Reg, owner_offset));
1462 testptr(tmp_Reg, tmp_Reg);
1463 jccb(Assembler::notZero, SpinLoop);
1464
1465 bind(SpinExit);
1466 jmp(retryLabel);
1467 bind(doneRetry);
1468 incrementl(retry_count_Reg); // clear z flag
1469}
1470
1471// Use RTM for normal stack locks
1472// Input: objReg (object to lock)
1473void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1474 Register retry_on_abort_count_Reg,
1475 RTMLockingCounters* stack_rtm_counters,
1476 Metadata* method_data, bool profile_rtm,
1477 Label& DONE_LABEL, Label& IsInflated) {
1478 assert(UseRTMForStackLocks, "why call this otherwise?");
1479 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1480 assert(tmpReg == rax, "");
1481 assert(scrReg == rdx, "");
1482 Label L_rtm_retry, L_decrement_retry, L_on_abort;
1483
1484 if (RTMRetryCount > 0) {
1485 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1486 bind(L_rtm_retry);
1487 }
1488 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1489 testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1490 jcc(Assembler::notZero, IsInflated);
1491
1492 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1493 Label L_noincrement;
1494 if (RTMTotalCountIncrRate > 1) {
1495 // tmpReg, scrReg and flags are killed
1496 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1497 }
1498 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1499 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1500 bind(L_noincrement);
1501 }
1502 xbegin(L_on_abort);
1503 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1504 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1505 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked
1506 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked
1507
1508 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1509 if (UseRTMXendForLockBusy) {
1510 xend();
1511 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
1512 jmp(L_decrement_retry);
1513 }
1514 else {
1515 xabort(0);
1516 }
1517 bind(L_on_abort);
1518 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1519 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1520 }
1521 bind(L_decrement_retry);
1522 if (RTMRetryCount > 0) {
1523 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1524 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1525 }
1526}
1527
1528// Use RTM for inflating locks
1529// inputs: objReg (object to lock)
1530// boxReg (on-stack box address (displaced header location) - KILLED)
1531// tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1532void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1533 Register scrReg, Register retry_on_busy_count_Reg,
1534 Register retry_on_abort_count_Reg,
1535 RTMLockingCounters* rtm_counters,
1536 Metadata* method_data, bool profile_rtm,
1537 Label& DONE_LABEL) {
1538 assert(UseRTMLocking, "why call this otherwise?");
1539 assert(tmpReg == rax, "");
1540 assert(scrReg == rdx, "");
1541 Label L_rtm_retry, L_decrement_retry, L_on_abort;
1542 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1543
1544 // Without cast to int32_t a movptr will destroy r10 which is typically obj
1545 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1546 movptr(boxReg, tmpReg); // Save ObjectMonitor address
1547
1548 if (RTMRetryCount > 0) {
1549 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy
1550 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1551 bind(L_rtm_retry);
1552 }
1553 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1554 Label L_noincrement;
1555 if (RTMTotalCountIncrRate > 1) {
1556 // tmpReg, scrReg and flags are killed
1557 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1558 }
1559 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1560 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1561 bind(L_noincrement);
1562 }
1563 xbegin(L_on_abort);
1564 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1565 movptr(tmpReg, Address(tmpReg, owner_offset));
1566 testptr(tmpReg, tmpReg);
1567 jcc(Assembler::zero, DONE_LABEL);
1568 if (UseRTMXendForLockBusy) {
1569 xend();
1570 jmp(L_decrement_retry);
1571 }
1572 else {
1573 xabort(0);
1574 }
1575 bind(L_on_abort);
1576 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1577 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1578 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1579 }
1580 if (RTMRetryCount > 0) {
1581 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1582 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1583 }
1584
1585 movptr(tmpReg, Address(boxReg, owner_offset)) ;
1586 testptr(tmpReg, tmpReg) ;
1587 jccb(Assembler::notZero, L_decrement_retry) ;
1588
1589 // Appears unlocked - try to swing _owner from null to non-null.
1590 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1591#ifdef _LP64
1592 Register threadReg = r15_thread;
1593#else
1594 get_thread(scrReg);
1595 Register threadReg = scrReg;
1596#endif
1597 lock();
1598 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1599
1600 if (RTMRetryCount > 0) {
1601 // success done else retry
1602 jccb(Assembler::equal, DONE_LABEL) ;
1603 bind(L_decrement_retry);
1604 // Spin and retry if lock is busy.
1605 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1606 }
1607 else {
1608 bind(L_decrement_retry);
1609 }
1610}
1611
1612#endif // INCLUDE_RTM_OPT
1613
1614// Fast_Lock and Fast_Unlock used by C2
1615
1616// Because the transitions from emitted code to the runtime
1617// monitorenter/exit helper stubs are so slow it's critical that
1618// we inline both the stack-locking fast-path and the inflated fast path.
1619//
1620// See also: cmpFastLock and cmpFastUnlock.
1621//
1622// What follows is a specialized inline transliteration of the code
1623// in slow_enter() and slow_exit(). If we're concerned about I$ bloat
1624// another option would be to emit TrySlowEnter and TrySlowExit methods
1625// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1627// indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
1628// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1629// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer from
// branch mispredictions if the processor uses simple bimodal branch predictors
// keyed by EIP, since the helper routines would be called from multiple
// synchronization sites.
1634//
1635// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1636// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1637// to those specialized methods. That'd give us a mostly platform-independent
1638// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
1643//
1644// TODO:
1645//
1646// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1647// This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1648// Given TLAB allocation, Self is usually manifested in a register, so passing it into
1649// the lock operators would typically be faster than reifying Self.
1650//
1651// * Ideally I'd define the primitives as:
1652// fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1653// fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1654// Unfortunately ADLC bugs prevent us from expressing the ideal form.
// Instead, we're stuck with the rather awkward and brittle register assignments below.
1656// Furthermore the register assignments are overconstrained, possibly resulting in
1657// sub-optimal code near the synchronization site.
1658//
1659// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
1660// Alternately, use a better sp-proximity test.
1661//
1662// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1663// Either one is sufficient to uniquely identify a thread.
1664// TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1665//
1666// * Intrinsify notify() and notifyAll() for the common cases where the
1667// object is locked by the calling thread but the waitlist is empty.
// This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1669//
1670// * use jccb and jmpb instead of jcc and jmp to improve code density.
1671// But beware of excessive branch density on AMD Opterons.
1672//
1673// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1674// or failure of the fast-path. If the fast-path fails then we pass
1675// control to the slow-path, typically in C. In Fast_Lock and
1676// Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1677// will emit a conditional branch immediately after the node.
1678// So we have branches to branches and lots of ICC.ZF games.
1679// Instead, it might be better to have C2 pass a "FailureLabel"
1680// into Fast_Lock and Fast_Unlock. In the case of success, control
1681// will drop through the node. ICC.ZF is undefined at exit.
1682// In the case of failure, the node will branch directly to the
1683// FailureLabel
1684
1685
1686// obj: object to lock
1687// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
1689// scr: tmp -- KILLED
1690void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1691 Register scrReg, Register cx1Reg, Register cx2Reg,
1692 BiasedLockingCounters* counters,
1693 RTMLockingCounters* rtm_counters,
1694 RTMLockingCounters* stack_rtm_counters,
1695 Metadata* method_data,
1696 bool use_rtm, bool profile_rtm) {
1697 // Ensure the register assignments are disjoint
1698 assert(tmpReg == rax, "");
1699
1700 if (use_rtm) {
1701 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1702 } else {
1703 assert(cx1Reg == noreg, "");
1704 assert(cx2Reg == noreg, "");
1705 assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1706 }
1707
1708 if (counters != NULL) {
1709 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1710 }
1711
1712 // Possible cases that we'll encounter in fast_lock
1713 // ------------------------------------------------
1714 // * Inflated
1715 // -- unlocked
1716 // -- Locked
1717 // = by self
1718 // = by other
1719 // * biased
1720 // -- by Self
1721 // -- by other
1722 // * neutral
1723 // * stack-locked
1724 // -- by self
1725 // = sp-proximity test hits
1726 // = sp-proximity test generates false-negative
1727 // -- by other
1728 //
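  //
  // Rough sketch of the triage emitted below (illustrative only; the real code
  // interleaves biased-locking, RTM and counter updates):
  //
  //   mark = obj->mark();                                        // [FETCH]
  //   if (mark & monitor_value) goto IsInflated;
  //   box->dhw = mark | unlocked_value;                          // anticipate success
  //   if (CAS(&obj->mark, mark | unlocked_value, box)) goto DONE; // ZF == 1
  //   // stack-locked by someone: recursive iff the mark points into our stack page
  //   box->dhw = (mark - rsp) & page_mask;                       // 0 => recursive
  //   goto DONE;                                                 // ZF reflects the AND above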
1729
1730 Label IsInflated, DONE_LABEL;
1731
1732 // it's stack-locked, biased or neutral
1733 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1734 // order to reduce the number of conditional branches in the most common cases.
1735 // Beware -- there's a subtle invariant that fetch of the markword
1736 // at [FETCH], below, will never observe a biased encoding (*101b).
1737 // If this invariant is not held we risk exclusion (safety) failure.
1738 if (UseBiasedLocking && !UseOptoBiasInlining) {
1739 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1740 }
1741
1742#if INCLUDE_RTM_OPT
1743 if (UseRTMForStackLocks && use_rtm) {
1744 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1745 stack_rtm_counters, method_data, profile_rtm,
1746 DONE_LABEL, IsInflated);
1747 }
1748#endif // INCLUDE_RTM_OPT
1749
1750 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
1751 testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1752 jccb(Assembler::notZero, IsInflated);
1753
1754 // Attempt stack-locking ...
1755 orptr (tmpReg, markOopDesc::unlocked_value);
1756 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
1757 lock();
1758 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
1759 if (counters != NULL) {
1760 cond_inc32(Assembler::equal,
1761 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1762 }
1763 jcc(Assembler::equal, DONE_LABEL); // Success
1764
1765 // Recursive locking.
1766 // The object is stack-locked: markword contains stack pointer to BasicLock.
1767 // Locked by current thread if difference with current SP is less than one page.
1768 subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1770 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1771 movptr(Address(boxReg, 0), tmpReg);
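  // Note: when the masked difference above is zero (the recursive case) this
  // stores a zero displaced header into the box; fast_unlock recognizes a
  // recursive stack-lock precisely by box->dhw == 0.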
1772 if (counters != NULL) {
1773 cond_inc32(Assembler::equal,
1774 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1775 }
1776 jmp(DONE_LABEL);
1777
1778 bind(IsInflated);
1779 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1780
1781#if INCLUDE_RTM_OPT
1782 // Use the same RTM locking code in 32- and 64-bit VM.
1783 if (use_rtm) {
1784 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1785 rtm_counters, method_data, profile_rtm, DONE_LABEL);
1786 } else {
1787#endif // INCLUDE_RTM_OPT
1788
1789#ifndef _LP64
1790 // The object is inflated.
1791
1792 // boxReg refers to the on-stack BasicLock in the current frame.
1793 // We'd like to write:
1794 // set box->_displaced_header = markOopDesc::unused_mark(). Any non-0 value suffices.
    // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers
1796 // additional latency as we have another ST in the store buffer that must drain.
1797
1798 // avoid ST-before-CAS
1799 // register juggle because we need tmpReg for cmpxchgptr below
1800 movptr(scrReg, boxReg);
1801 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
1802
1803 // Optimistic form: consider XORL tmpReg,tmpReg
1804 movptr(tmpReg, NULL_WORD);
1805
1806 // Appears unlocked - try to swing _owner from null to non-null.
1807 // Ideally, I'd manifest "Self" with get_thread and then attempt
1808 // to CAS the register containing Self into m->Owner.
1809 // But we don't have enough registers, so instead we can either try to CAS
1810 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
1811 // we later store "Self" into m->Owner. Transiently storing a stack address
1812 // (rsp or the address of the box) into m->owner is harmless.
1813 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1814 lock();
1815 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1816 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
1817 // If we weren't able to swing _owner from NULL to the BasicLock
1818 // then take the slow path.
1819 jccb (Assembler::notZero, DONE_LABEL);
1820 // update _owner from BasicLock to thread
1821 get_thread (scrReg); // beware: clobbers ICCs
1822 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1823 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
1824
1825 // If the CAS fails we can either retry or pass control to the slow-path.
1826 // We use the latter tactic.
1827 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1828 // If the CAS was successful ...
1829 // Self has acquired the lock
1830 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1831 // Intentional fall-through into DONE_LABEL ...
1832#else // _LP64
1833 // It's inflated
1834 movq(scrReg, tmpReg);
1835 xorq(tmpReg, tmpReg);
1836
1837 lock();
1838 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1839 // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1840 // Without cast to int32_t movptr will destroy r10 which is typically obj.
1841 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1842 // Intentional fall-through into DONE_LABEL ...
1843 // Propagate ICC.ZF from CAS above into DONE_LABEL.
1844#endif // _LP64
1845#if INCLUDE_RTM_OPT
1846 } // use_rtm()
1847#endif
1848 // DONE_LABEL is a hot target - we'd really like to place it at the
1849 // start of cache line by padding with NOPs.
1850 // See the AMD and Intel software optimization manuals for the
1851 // most efficient "long" NOP encodings.
1852 // Unfortunately none of our alignment mechanisms suffice.
1853 bind(DONE_LABEL);
1854
1855 // At DONE_LABEL the icc ZFlag is set as follows ...
1856 // Fast_Unlock uses the same protocol.
1857 // ZFlag == 1 -> Success
1858 // ZFlag == 0 -> Failure - force control through the slow-path
1859}
1860
1861// obj: object to unlock
1862// box: box address (displaced header location), killed. Must be EAX.
1863// tmp: killed, cannot be obj nor box.
1864//
1865// Some commentary on balanced locking:
1866//
1867// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1868// Methods that don't have provably balanced locking are forced to run in the
1869// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1870// The interpreter provides two properties:
1871// I1: At return-time the interpreter automatically and quietly unlocks any
// objects acquired by the current activation (frame). Recall that the
1873// interpreter maintains an on-stack list of locks currently held by
1874// a frame.
1875// I2: If a method attempts to unlock an object that is not held by the
// frame, the interpreter throws IMSX.
1877//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1879// B() doesn't have provably balanced locking so it runs in the interpreter.
1880// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
1881// is still locked by A().
1882//
1883// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
1884// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1885// should not be unlocked by "normal" java-level locking and vice-versa. The specification
1886// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
1888// could reasonably *avoid* checking owner in Fast_Unlock().
1889// In the interest of performance we elide m->Owner==Self check in unlock.
1890// A perfectly viable alternative is to elide the owner check except when
1891// Xcheck:jni is enabled.
1892
1893void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1894 assert(boxReg == rax, "");
1895 assert_different_registers(objReg, boxReg, tmpReg);
1896
1897 Label DONE_LABEL, Stacked, CheckSucc;
1898
1899 // Critically, the biased locking test must have precedence over
1900 // and appear before the (box->dhw == 0) recursive stack-lock test.
1901 if (UseBiasedLocking && !UseOptoBiasInlining) {
1902 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1903 }
1904
1905#if INCLUDE_RTM_OPT
1906 if (UseRTMForStackLocks && use_rtm) {
1907 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1908 Label L_regular_unlock;
1909 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1910 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1911 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked
1912 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock
1913 xend(); // otherwise end...
1914 jmp(DONE_LABEL); // ... and we're done
1915 bind(L_regular_unlock);
1916 }
1917#endif
1918
1919 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1920 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
1921 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
1922 testptr(tmpReg, markOopDesc::monitor_value); // Inflated?
1923 jccb (Assembler::zero, Stacked);
1924
1925 // It's inflated.
1926#if INCLUDE_RTM_OPT
1927 if (use_rtm) {
1928 Label L_regular_inflated_unlock;
1929 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1930 movptr(boxReg, Address(tmpReg, owner_offset));
1931 testptr(boxReg, boxReg);
1932 jccb(Assembler::notZero, L_regular_inflated_unlock);
1933 xend();
1934 jmpb(DONE_LABEL);
1935 bind(L_regular_inflated_unlock);
1936 }
1937#endif
1938
1939 // Despite our balanced locking property we still check that m->_owner == Self
1940 // as java routines or native JNI code called by this thread might
1941 // have released the lock.
1942 // Refer to the comments in synchronizer.cpp for how we might encode extra
1943 // state in _succ so we can avoid fetching EntryList|cxq.
1944 //
1945 // I'd like to add more cases in fast_lock() and fast_unlock() --
1946 // such as recursive enter and exit -- but we have to be wary of
1947 // I$ bloat, T$ effects and BP$ effects.
1948 //
1949 // If there's no contention try a 1-0 exit. That is, exit without
1950 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
1951 // we detect and recover from the race that the 1-0 exit admits.
1952 //
1953 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1954 // before it STs null into _owner, releasing the lock. Updates
1955 // to data protected by the critical section must be visible before
1956 // we drop the lock (and thus before any other thread could acquire
1957 // the lock and observe the fields protected by the lock).
  // IA32's memory model (TSO) orders stores with respect to
  // each other, so there's no need for an explicit barrier (fence).
1960 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
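  //
  // Rough shape of the inflated exit emitted below (illustrative only):
  //
  //   if (m->_recursions != 0)                    goto slow path       (ZF == 0)
  //   if ((m->_cxq | m->_EntryList) == 0)       { m->_owner = NULL; success (ZF == 1) }
  //   // contended case, 64-bit only (CheckSucc):
  //   if (m->_succ == NULL)                       goto slow path
  //   m->_owner = NULL;  fence;
  //   if (m->_succ != NULL)                       success              // a successor will take over
  //   else if (CAS(&m->_owner, NULL, Self) fails) success              // another thread acquired it
  //   else                                        goto slow path       // we re-acquired; slow path handles succession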
1961#ifndef _LP64
1962 get_thread (boxReg);
1963
1964 // Note that we could employ various encoding schemes to reduce
1965 // the number of loads below (currently 4) to just 2 or 3.
1966 // Refer to the comments in synchronizer.cpp.
1967 // In practice the chain of fetches doesn't seem to impact performance, however.
1968 xorptr(boxReg, boxReg);
1969 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1970 jccb (Assembler::notZero, DONE_LABEL);
1971 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1972 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1973 jccb (Assembler::notZero, CheckSucc);
1974 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1975 jmpb (DONE_LABEL);
1976
1977 bind (Stacked);
1978 // It's not inflated and it's not recursively stack-locked and it's not biased.
1979 // It must be stack-locked.
1980 // Try to reset the header to displaced header.
1981 // The "box" value on the stack is stable, so we can reload
1982 // and be assured we observe the same value as above.
1983 movptr(tmpReg, Address(boxReg, 0));
1984 lock();
1985 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
1987
1988 // DONE_LABEL is a hot target - we'd really like to place it at the
1989 // start of cache line by padding with NOPs.
1990 // See the AMD and Intel software optimization manuals for the
1991 // most efficient "long" NOP encodings.
1992 // Unfortunately none of our alignment mechanisms suffice.
1993 bind (CheckSucc);
1994#else // _LP64
1995 // It's inflated
1996 xorptr(boxReg, boxReg);
1997 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1998 jccb (Assembler::notZero, DONE_LABEL);
1999 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2000 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2001 jccb (Assembler::notZero, CheckSucc);
2002 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2003 jmpb (DONE_LABEL);
2004
2005 // Try to avoid passing control into the slow_path ...
2006 Label LSuccess, LGoSlowPath ;
2007 bind (CheckSucc);
2008
2009 // The following optional optimization can be elided if necessary
2010 // Effectively: if (succ == null) goto SlowPath
2011 // The code reduces the window for a race, however,
2012 // and thus benefits performance.
2013 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2014 jccb (Assembler::zero, LGoSlowPath);
2015
2016 xorptr(boxReg, boxReg);
2017 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2018
2019 // Memory barrier/fence
2020 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2021 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2022 // This is faster on Nehalem and AMD Shanghai/Barcelona.
2023 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2024 // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2025 // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2026 lock(); addl(Address(rsp, 0), 0);
2027
2028 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2029 jccb (Assembler::notZero, LSuccess);
2030
2031 // Rare inopportune interleaving - race.
2032 // The successor vanished in the small window above.
2033 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2034 // We need to ensure progress and succession.
2035 // Try to reacquire the lock.
2036 // If that fails then the new owner is responsible for succession and this
2037 // thread needs to take no further action and can exit via the fast path (success).
2038 // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
2042
2043 // box is really RAX -- the following CMPXCHG depends on that binding
2044 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2045 lock();
2046 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2047 // There's no successor so we tried to regrab the lock.
2048 // If that didn't work, then another thread grabbed the
2049 // lock so we're done (and exit was a success).
2050 jccb (Assembler::notEqual, LSuccess);
2051 // Intentional fall-through into slow-path
2052
2053 bind (LGoSlowPath);
2054 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
2055 jmpb (DONE_LABEL);
2056
2057 bind (LSuccess);
2058 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
2059 jmpb (DONE_LABEL);
2060
2061 bind (Stacked);
2062 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
2063 lock();
2064 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2065
2066#endif
2067 bind(DONE_LABEL);
2068}
2069#endif // COMPILER2
2070
2071void MacroAssembler::c2bool(Register x) {
2072 // implements x == 0 ? 0 : 1
2073 // note: must only look at least-significant byte of x
2074 // since C-style booleans are stored in one byte
2075 // only! (was bug)
2076 andl(x, 0xFF);
2077 setb(Assembler::notZero, x);
2078}
2079
// Wouldn't be needed if the AddressLiteral version had a different name
2081void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2082 Assembler::call(L, rtype);
2083}
2084
2085void MacroAssembler::call(Register entry) {
2086 Assembler::call(entry);
2087}
2088
2089void MacroAssembler::call(AddressLiteral entry) {
2090 if (reachable(entry)) {
2091 Assembler::call_literal(entry.target(), entry.rspec());
2092 } else {
2093 lea(rscratch1, entry);
2094 Assembler::call(rscratch1);
2095 }
2096}
2097
2098void MacroAssembler::ic_call(address entry, jint method_index) {
2099 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2100 movptr(rax, (intptr_t)Universe::non_oop_word());
2101 call(AddressLiteral(entry, rh));
2102}
2103
2104// Implementation of call_VM versions
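//
// The short-form call_VM variants below all use the same small trampoline:
// an intra-buffer call to a local label pushes a return address (from which
// call_VM_helper derives last_Java_pc, see the comment there), the out-of-line
// body between the label and its ret performs the actual VM call, and the jmp
// emitted right after the call skips over that body in straight-line execution.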
2105
2106void MacroAssembler::call_VM(Register oop_result,
2107 address entry_point,
2108 bool check_exceptions) {
2109 Label C, E;
2110 call(C, relocInfo::none);
2111 jmp(E);
2112
2113 bind(C);
2114 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2115 ret(0);
2116
2117 bind(E);
2118}
2119
2120void MacroAssembler::call_VM(Register oop_result,
2121 address entry_point,
2122 Register arg_1,
2123 bool check_exceptions) {
2124 Label C, E;
2125 call(C, relocInfo::none);
2126 jmp(E);
2127
2128 bind(C);
2129 pass_arg1(this, arg_1);
2130 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2131 ret(0);
2132
2133 bind(E);
2134}
2135
2136void MacroAssembler::call_VM(Register oop_result,
2137 address entry_point,
2138 Register arg_1,
2139 Register arg_2,
2140 bool check_exceptions) {
2141 Label C, E;
2142 call(C, relocInfo::none);
2143 jmp(E);
2144
2145 bind(C);
2146
2147 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2148
2149 pass_arg2(this, arg_2);
2150 pass_arg1(this, arg_1);
2151 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2152 ret(0);
2153
2154 bind(E);
2155}
2156
2157void MacroAssembler::call_VM(Register oop_result,
2158 address entry_point,
2159 Register arg_1,
2160 Register arg_2,
2161 Register arg_3,
2162 bool check_exceptions) {
2163 Label C, E;
2164 call(C, relocInfo::none);
2165 jmp(E);
2166
2167 bind(C);
2168
2169 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2170 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2171 pass_arg3(this, arg_3);
2172
2173 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2174 pass_arg2(this, arg_2);
2175
2176 pass_arg1(this, arg_1);
2177 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2178 ret(0);
2179
2180 bind(E);
2181}
2182
2183void MacroAssembler::call_VM(Register oop_result,
2184 Register last_java_sp,
2185 address entry_point,
2186 int number_of_arguments,
2187 bool check_exceptions) {
2188 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2189 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2190}
2191
2192void MacroAssembler::call_VM(Register oop_result,
2193 Register last_java_sp,
2194 address entry_point,
2195 Register arg_1,
2196 bool check_exceptions) {
2197 pass_arg1(this, arg_1);
2198 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2199}
2200
2201void MacroAssembler::call_VM(Register oop_result,
2202 Register last_java_sp,
2203 address entry_point,
2204 Register arg_1,
2205 Register arg_2,
2206 bool check_exceptions) {
2207
2208 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2209 pass_arg2(this, arg_2);
2210 pass_arg1(this, arg_1);
2211 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2212}
2213
2214void MacroAssembler::call_VM(Register oop_result,
2215 Register last_java_sp,
2216 address entry_point,
2217 Register arg_1,
2218 Register arg_2,
2219 Register arg_3,
2220 bool check_exceptions) {
2221 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2222 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2223 pass_arg3(this, arg_3);
2224 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2225 pass_arg2(this, arg_2);
2226 pass_arg1(this, arg_1);
2227 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2228}
2229
2230void MacroAssembler::super_call_VM(Register oop_result,
2231 Register last_java_sp,
2232 address entry_point,
2233 int number_of_arguments,
2234 bool check_exceptions) {
2235 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2236 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2237}
2238
2239void MacroAssembler::super_call_VM(Register oop_result,
2240 Register last_java_sp,
2241 address entry_point,
2242 Register arg_1,
2243 bool check_exceptions) {
2244 pass_arg1(this, arg_1);
2245 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2246}
2247
2248void MacroAssembler::super_call_VM(Register oop_result,
2249 Register last_java_sp,
2250 address entry_point,
2251 Register arg_1,
2252 Register arg_2,
2253 bool check_exceptions) {
2254
2255 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2256 pass_arg2(this, arg_2);
2257 pass_arg1(this, arg_1);
2258 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2259}
2260
2261void MacroAssembler::super_call_VM(Register oop_result,
2262 Register last_java_sp,
2263 address entry_point,
2264 Register arg_1,
2265 Register arg_2,
2266 Register arg_3,
2267 bool check_exceptions) {
2268 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2269 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2270 pass_arg3(this, arg_3);
2271 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2272 pass_arg2(this, arg_2);
2273 pass_arg1(this, arg_1);
2274 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2275}
2276
2277void MacroAssembler::call_VM_base(Register oop_result,
2278 Register java_thread,
2279 Register last_java_sp,
2280 address entry_point,
2281 int number_of_arguments,
2282 bool check_exceptions) {
2283 // determine java_thread register
2284 if (!java_thread->is_valid()) {
2285#ifdef _LP64
2286 java_thread = r15_thread;
2287#else
2288 java_thread = rdi;
2289 get_thread(java_thread);
2290#endif // LP64
2291 }
2292 // determine last_java_sp register
2293 if (!last_java_sp->is_valid()) {
2294 last_java_sp = rsp;
2295 }
2296 // debugging support
2297 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
2298 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2299#ifdef ASSERT
2300 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2301 // r12 is the heapbase.
2302 LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2303#endif // ASSERT
2304
2305 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
2306 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2307
2308 // push java thread (becomes first argument of C function)
2309
2310 NOT_LP64(push(java_thread); number_of_arguments++);
2311 LP64_ONLY(mov(c_rarg0, r15_thread));
2312
2313 // set last Java frame before call
2314 assert(last_java_sp != rbp, "can't use ebp/rbp");
2315
2316 // Only interpreter should have to set fp
2317 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2318
2319 // do the call, remove parameters
2320 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2321
2322 // restore the thread (cannot use the pushed argument since arguments
2323 // may be overwritten by C code generated by an optimizing compiler);
2324 // however can use the register value directly if it is callee saved.
2325 if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2326 // rdi & rsi (also r15) are callee saved -> nothing to do
2327#ifdef ASSERT
2328 guarantee(java_thread != rax, "change this code");
2329 push(rax);
2330 { Label L;
2331 get_thread(rax);
2332 cmpptr(java_thread, rax);
2333 jcc(Assembler::equal, L);
2334 STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2335 bind(L);
2336 }
2337 pop(rax);
2338#endif
2339 } else {
2340 get_thread(java_thread);
2341 }
2342 // reset last Java frame
2343 // Only interpreter should have to clear fp
2344 reset_last_Java_frame(java_thread, true);
2345
2346 // C++ interp handles this in the interpreter
2347 check_and_handle_popframe(java_thread);
2348 check_and_handle_earlyret(java_thread);
2349
2350 if (check_exceptions) {
2351 // check for pending exceptions (java_thread is set upon return)
2352 cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2353#ifndef _LP64
2354 jump_cc(Assembler::notEqual,
2355 RuntimeAddress(StubRoutines::forward_exception_entry()));
2356#else
    // This used to be a conditional jump to forward_exception; however, after
    // relocation that branch might not reach the stub. So we branch around an
    // unconditional jump that can always reach the target.
2360
2361 Label ok;
2362 jcc(Assembler::equal, ok);
2363 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2364 bind(ok);
2365#endif // LP64
2366 }
2367
2368 // get oop result if there is one and reset the value in the thread
2369 if (oop_result->is_valid()) {
2370 get_vm_result(oop_result, java_thread);
2371 }
2372}
2373
2374void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2375
  // Calculate the value for last_Java_sp; this is
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
2381 // On 32bit we then have to push additional args on the stack to accomplish
2382 // the actual requested call. On 64bit call_VM only can use register args
2383 // so the only extra space is the return address that call_VM created.
2384 // This hopefully explains the calculations here.
2385
2386#ifdef _LP64
2387 // We've pushed one address, correct last_Java_sp
2388 lea(rax, Address(rsp, wordSize));
2389#else
2390 lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2391#endif // LP64
2392
2393 call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2394
2395}
2396
2397// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
2398void MacroAssembler::call_VM_leaf0(address entry_point) {
2399 MacroAssembler::call_VM_leaf_base(entry_point, 0);
2400}
2401
2402void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2403 call_VM_leaf_base(entry_point, number_of_arguments);
2404}
2405
2406void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2407 pass_arg0(this, arg_0);
2408 call_VM_leaf(entry_point, 1);
2409}
2410
2411void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2412
2413 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2414 pass_arg1(this, arg_1);
2415 pass_arg0(this, arg_0);
2416 call_VM_leaf(entry_point, 2);
2417}
2418
2419void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2420 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2421 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2422 pass_arg2(this, arg_2);
2423 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2424 pass_arg1(this, arg_1);
2425 pass_arg0(this, arg_0);
2426 call_VM_leaf(entry_point, 3);
2427}
2428
2429void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2430 pass_arg0(this, arg_0);
2431 MacroAssembler::call_VM_leaf_base(entry_point, 1);
2432}
2433
2434void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2435
2436 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2437 pass_arg1(this, arg_1);
2438 pass_arg0(this, arg_0);
2439 MacroAssembler::call_VM_leaf_base(entry_point, 2);
2440}
2441
2442void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2443 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2444 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2445 pass_arg2(this, arg_2);
2446 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2447 pass_arg1(this, arg_1);
2448 pass_arg0(this, arg_0);
2449 MacroAssembler::call_VM_leaf_base(entry_point, 3);
2450}
2451
2452void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2453 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2454 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2455 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2456 pass_arg3(this, arg_3);
2457 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2458 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2459 pass_arg2(this, arg_2);
2460 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2461 pass_arg1(this, arg_1);
2462 pass_arg0(this, arg_0);
2463 MacroAssembler::call_VM_leaf_base(entry_point, 4);
2464}
2465
2466void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2467 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2468 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2469 verify_oop(oop_result, "broken oop in call_VM_base");
2470}
2471
2472void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2473 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2474 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2475}
2476
2477void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2478}
2479
2480void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2481}
2482
2483void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2484 if (reachable(src1)) {
2485 cmpl(as_Address(src1), imm);
2486 } else {
2487 lea(rscratch1, src1);
2488 cmpl(Address(rscratch1, 0), imm);
2489 }
2490}
2491
2492void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2493 assert(!src2.is_lval(), "use cmpptr");
2494 if (reachable(src2)) {
2495 cmpl(src1, as_Address(src2));
2496 } else {
2497 lea(rscratch1, src2);
2498 cmpl(src1, Address(rscratch1, 0));
2499 }
2500}
2501
2502void MacroAssembler::cmp32(Register src1, int32_t imm) {
2503 Assembler::cmpl(src1, imm);
2504}
2505
2506void MacroAssembler::cmp32(Register src1, Address src2) {
2507 Assembler::cmpl(src1, src2);
2508}
2509
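// cmpsd2int and cmpss2int below implement the Java floating-point compare
// semantics: dst receives -1, 0 or +1 for less-than, equal and greater-than,
// and an unordered result (NaN operand, signalled by PF after ucomisd/ucomiss)
// is mapped to -1 or +1 according to unordered_is_less.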
2510void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2511 ucomisd(opr1, opr2);
2512
2513 Label L;
2514 if (unordered_is_less) {
2515 movl(dst, -1);
2516 jcc(Assembler::parity, L);
2517 jcc(Assembler::below , L);
2518 movl(dst, 0);
2519 jcc(Assembler::equal , L);
2520 increment(dst);
2521 } else { // unordered is greater
2522 movl(dst, 1);
2523 jcc(Assembler::parity, L);
2524 jcc(Assembler::above , L);
2525 movl(dst, 0);
2526 jcc(Assembler::equal , L);
2527 decrementl(dst);
2528 }
2529 bind(L);
2530}
2531
2532void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2533 ucomiss(opr1, opr2);
2534
2535 Label L;
2536 if (unordered_is_less) {
2537 movl(dst, -1);
2538 jcc(Assembler::parity, L);
2539 jcc(Assembler::below , L);
2540 movl(dst, 0);
2541 jcc(Assembler::equal , L);
2542 increment(dst);
2543 } else { // unordered is greater
2544 movl(dst, 1);
2545 jcc(Assembler::parity, L);
2546 jcc(Assembler::above , L);
2547 movl(dst, 0);
2548 jcc(Assembler::equal , L);
2549 decrementl(dst);
2550 }
2551 bind(L);
2552}
2553
2554
2555void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2556 if (reachable(src1)) {
2557 cmpb(as_Address(src1), imm);
2558 } else {
2559 lea(rscratch1, src1);
2560 cmpb(Address(rscratch1, 0), imm);
2561 }
2562}
2563
2564void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2565#ifdef _LP64
2566 if (src2.is_lval()) {
2567 movptr(rscratch1, src2);
2568 Assembler::cmpq(src1, rscratch1);
2569 } else if (reachable(src2)) {
2570 cmpq(src1, as_Address(src2));
2571 } else {
2572 lea(rscratch1, src2);
2573 Assembler::cmpq(src1, Address(rscratch1, 0));
2574 }
2575#else
2576 if (src2.is_lval()) {
2577 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2578 } else {
2579 cmpl(src1, as_Address(src2));
2580 }
2581#endif // _LP64
2582}
2583
2584void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2585 assert(src2.is_lval(), "not a mem-mem compare");
2586#ifdef _LP64
2587 // moves src2's literal address
2588 movptr(rscratch1, src2);
2589 Assembler::cmpq(src1, rscratch1);
2590#else
2591 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2592#endif // _LP64
2593}
2594
2595void MacroAssembler::cmpoop(Register src1, Register src2) {
2596 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2597 bs->obj_equals(this, src1, src2);
2598}
2599
2600void MacroAssembler::cmpoop(Register src1, Address src2) {
2601 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2602 bs->obj_equals(this, src1, src2);
2603}
2604
2605#ifdef _LP64
2606void MacroAssembler::cmpoop(Register src1, jobject src2) {
2607 movoop(rscratch1, src2);
2608 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2609 bs->obj_equals(this, src1, rscratch1);
2610}
2611#endif
2612
2613void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2614 if (reachable(adr)) {
2615 lock();
2616 cmpxchgptr(reg, as_Address(adr));
2617 } else {
2618 lea(rscratch1, adr);
2619 lock();
2620 cmpxchgptr(reg, Address(rscratch1, 0));
2621 }
2622}
2623
2624void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2625 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2626}
2627
2628void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2629 if (reachable(src)) {
2630 Assembler::comisd(dst, as_Address(src));
2631 } else {
2632 lea(rscratch1, src);
2633 Assembler::comisd(dst, Address(rscratch1, 0));
2634 }
2635}
2636
2637void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2638 if (reachable(src)) {
2639 Assembler::comiss(dst, as_Address(src));
2640 } else {
2641 lea(rscratch1, src);
2642 Assembler::comiss(dst, Address(rscratch1, 0));
2643 }
2644}
2645
2646
2647void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2648 Condition negated_cond = negate_condition(cond);
2649 Label L;
2650 jcc(negated_cond, L);
2651 pushf(); // Preserve flags
2652 atomic_incl(counter_addr);
2653 popf();
2654 bind(L);
2655}
2656
2657int MacroAssembler::corrected_idivl(Register reg) {
2658 // Full implementation of Java idiv and irem; checks for
2659 // special case as described in JVM spec., p.243 & p.271.
2660 // The function returns the (pc) offset of the idivl
2661 // instruction - may be needed for implicit exceptions.
2662 //
  //            normal case                           special case
  //
  //  input : rax: dividend                           min_int
  //          reg: divisor (may not be rax or rdx)    -1
  //
  //  output: rax: quotient  (= rax idiv reg)         min_int
  //          rdx: remainder (= rax irem reg)         0
2670 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2671 const int min_int = 0x80000000;
2672 Label normal_case, special_case;
2673
2674 // check for special case
2675 cmpl(rax, min_int);
2676 jcc(Assembler::notEqual, normal_case);
2677 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2678 cmpl(reg, -1);
2679 jcc(Assembler::equal, special_case);
2680
2681 // handle normal case
2682 bind(normal_case);
2683 cdql();
2684 int idivl_offset = offset();
2685 idivl(reg);
2686
2687 // normal and special case exit
2688 bind(special_case);
2689
2690 return idivl_offset;
2691}
2692
2693
2694
2695void MacroAssembler::decrementl(Register reg, int value) {
2696 if (value == min_jint) {subl(reg, value) ; return; }
2697 if (value < 0) { incrementl(reg, -value); return; }
2698 if (value == 0) { ; return; }
2699 if (value == 1 && UseIncDec) { decl(reg) ; return; }
2700 /* else */ { subl(reg, value) ; return; }
2701}
2702
2703void MacroAssembler::decrementl(Address dst, int value) {
2704 if (value == min_jint) {subl(dst, value) ; return; }
2705 if (value < 0) { incrementl(dst, -value); return; }
2706 if (value == 0) { ; return; }
2707 if (value == 1 && UseIncDec) { decl(dst) ; return; }
2708 /* else */ { subl(dst, value) ; return; }
2709}
2710
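// Signed division by a power of two using an arithmetic shift. A plain sarl
// rounds toward negative infinity, so for negative dividends we first add
// (1 << shift_value) - 1 to obtain round-toward-zero semantics.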
2711void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2712 assert (shift_value > 0, "illegal shift value");
2713 Label _is_positive;
2714 testl (reg, reg);
2715 jcc (Assembler::positive, _is_positive);
2716 int offset = (1 << shift_value) - 1 ;
2717
2718 if (offset == 1) {
2719 incrementl(reg);
2720 } else {
2721 addl(reg, offset);
2722 }
2723
2724 bind (_is_positive);
2725 sarl(reg, shift_value);
2726}
2727
2728void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2729 if (reachable(src)) {
2730 Assembler::divsd(dst, as_Address(src));
2731 } else {
2732 lea(rscratch1, src);
2733 Assembler::divsd(dst, Address(rscratch1, 0));
2734 }
2735}
2736
2737void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2738 if (reachable(src)) {
2739 Assembler::divss(dst, as_Address(src));
2740 } else {
2741 lea(rscratch1, src);
2742 Assembler::divss(dst, Address(rscratch1, 0));
2743 }
2744}
2745
2746// !defined(COMPILER2) is because of stupid core builds
2747#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2748void MacroAssembler::empty_FPU_stack() {
2749 if (VM_Version::supports_mmx()) {
2750 emms();
2751 } else {
2752 for (int i = 8; i-- > 0; ) ffree(i);
2753 }
2754}
2755#endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2756
2757
2758void MacroAssembler::enter() {
2759 push(rbp);
2760 mov(rbp, rsp);
2761}
2762
2763// A 5 byte nop that is safe for patching (see patch_verified_entry)
2764void MacroAssembler::fat_nop() {
2765 if (UseAddressNop) {
2766 addr_nop_5();
2767 } else {
2768 emit_int8(0x26); // es:
2769 emit_int8(0x2e); // cs:
2770 emit_int8(0x64); // fs:
2771 emit_int8(0x65); // gs:
2772 emit_int8((unsigned char)0x90);
2773 }
2774}
2775
2776void MacroAssembler::fcmp(Register tmp) {
2777 fcmp(tmp, 1, true, true);
2778}
2779
2780void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2781 assert(!pop_right || pop_left, "usage error");
2782 if (VM_Version::supports_cmov()) {
2783 assert(tmp == noreg, "unneeded temp");
2784 if (pop_left) {
2785 fucomip(index);
2786 } else {
2787 fucomi(index);
2788 }
2789 if (pop_right) {
2790 fpop();
2791 }
2792 } else {
2793 assert(tmp != noreg, "need temp");
2794 if (pop_left) {
2795 if (pop_right) {
2796 fcompp();
2797 } else {
2798 fcomp(index);
2799 }
2800 } else {
2801 fcom(index);
2802 }
2803 // convert FPU condition into eflags condition via rax,
2804 save_rax(tmp);
2805 fwait(); fnstsw_ax();
2806 sahf();
2807 restore_rax(tmp);
2808 }
2809 // condition codes set as follows:
2810 //
2811 // CF (corresponds to C0) if x < y
2812 // PF (corresponds to C2) if unordered
2813 // ZF (corresponds to C3) if x = y
2814}
2815
2816void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2817 fcmp2int(dst, unordered_is_less, 1, true, true);
2818}
2819
2820void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2821 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2822 Label L;
2823 if (unordered_is_less) {
2824 movl(dst, -1);
2825 jcc(Assembler::parity, L);
2826 jcc(Assembler::below , L);
2827 movl(dst, 0);
2828 jcc(Assembler::equal , L);
2829 increment(dst);
2830 } else { // unordered is greater
2831 movl(dst, 1);
2832 jcc(Assembler::parity, L);
2833 jcc(Assembler::above , L);
2834 movl(dst, 0);
2835 jcc(Assembler::equal , L);
2836 decrementl(dst);
2837 }
2838 bind(L);
2839}
2840
2841void MacroAssembler::fld_d(AddressLiteral src) {
2842 fld_d(as_Address(src));
2843}
2844
2845void MacroAssembler::fld_s(AddressLiteral src) {
2846 fld_s(as_Address(src));
2847}
2848
2849void MacroAssembler::fld_x(AddressLiteral src) {
2850 Assembler::fld_x(as_Address(src));
2851}
2852
2853void MacroAssembler::fldcw(AddressLiteral src) {
2854 Assembler::fldcw(as_Address(src));
2855}
2856
2857void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2858 if (reachable(src)) {
2859 Assembler::mulpd(dst, as_Address(src));
2860 } else {
2861 lea(rscratch1, src);
2862 Assembler::mulpd(dst, Address(rscratch1, 0));
2863 }
2864}
2865
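// increase_precision temporarily raises the x87 precision control to 64-bit
// significands (both PC bits set via the 0x300 mask), keeping the previous
// control word on the stack so restore_precision() can reload it.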
2866void MacroAssembler::increase_precision() {
2867 subptr(rsp, BytesPerWord);
2868 fnstcw(Address(rsp, 0));
2869 movl(rax, Address(rsp, 0));
2870 orl(rax, 0x300);
2871 push(rax);
2872 fldcw(Address(rsp, 0));
2873 pop(rax);
2874}
2875
2876void MacroAssembler::restore_precision() {
2877 fldcw(Address(rsp, 0));
2878 addptr(rsp, BytesPerWord);
2879}
2880
2881void MacroAssembler::fpop() {
2882 ffree();
2883 fincstp();
2884}
2885
2886void MacroAssembler::load_float(Address src) {
2887 if (UseSSE >= 1) {
2888 movflt(xmm0, src);
2889 } else {
2890 LP64_ONLY(ShouldNotReachHere());
2891 NOT_LP64(fld_s(src));
2892 }
2893}
2894
2895void MacroAssembler::store_float(Address dst) {
2896 if (UseSSE >= 1) {
2897 movflt(dst, xmm0);
2898 } else {
2899 LP64_ONLY(ShouldNotReachHere());
2900 NOT_LP64(fstp_s(dst));
2901 }
2902}
2903
2904void MacroAssembler::load_double(Address src) {
2905 if (UseSSE >= 2) {
2906 movdbl(xmm0, src);
2907 } else {
2908 LP64_ONLY(ShouldNotReachHere());
2909 NOT_LP64(fld_d(src));
2910 }
2911}
2912
2913void MacroAssembler::store_double(Address dst) {
2914 if (UseSSE >= 2) {
2915 movdbl(dst, xmm0);
2916 } else {
2917 LP64_ONLY(ShouldNotReachHere());
2918 NOT_LP64(fstp_d(dst));
2919 }
2920}
2921
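// Floating-point remainder via repeated fprem. fprem may deliver only a
// partial remainder and reports this by setting the C2 bit (0x400 in the FPU
// status word), so we loop until C2 clears; the 32-bit path tests C2 through
// sahf, where it shows up as PF.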
2922void MacroAssembler::fremr(Register tmp) {
2923 save_rax(tmp);
2924 { Label L;
2925 bind(L);
2926 fprem();
2927 fwait(); fnstsw_ax();
2928#ifdef _LP64
2929 testl(rax, 0x400);
2930 jcc(Assembler::notEqual, L);
2931#else
2932 sahf();
2933 jcc(Assembler::parity, L);
2934#endif // _LP64
2935 }
2936 restore_rax(tmp);
2937 // Result is in ST0.
2938 // Note: fxch & fpop to get rid of ST1
2939 // (otherwise FPU stack could overflow eventually)
2940 fxch(1);
2941 fpop();
2942}
2943
2944// dst = c = a * b + c
2945void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2946 Assembler::vfmadd231sd(c, a, b);
2947 if (dst != c) {
2948 movdbl(dst, c);
2949 }
2950}
2951
2952// dst = c = a * b + c
2953void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2954 Assembler::vfmadd231ss(c, a, b);
2955 if (dst != c) {
2956 movflt(dst, c);
2957 }
2958}
2959
2960// dst = c = a * b + c
2961void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2962 Assembler::vfmadd231pd(c, a, b, vector_len);
2963 if (dst != c) {
2964 vmovdqu(dst, c);
2965 }
2966}
2967
2968// dst = c = a * b + c
2969void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2970 Assembler::vfmadd231ps(c, a, b, vector_len);
2971 if (dst != c) {
2972 vmovdqu(dst, c);
2973 }
2974}
2975
2976// dst = c = a * b + c
2977void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2978 Assembler::vfmadd231pd(c, a, b, vector_len);
2979 if (dst != c) {
2980 vmovdqu(dst, c);
2981 }
2982}
2983
2984// dst = c = a * b + c
2985void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2986 Assembler::vfmadd231ps(c, a, b, vector_len);
2987 if (dst != c) {
2988 vmovdqu(dst, c);
2989 }
2990}
2991
2992void MacroAssembler::incrementl(AddressLiteral dst) {
2993 if (reachable(dst)) {
2994 incrementl(as_Address(dst));
2995 } else {
2996 lea(rscratch1, dst);
2997 incrementl(Address(rscratch1, 0));
2998 }
2999}
3000
3001void MacroAssembler::incrementl(ArrayAddress dst) {
3002 incrementl(as_Address(dst));
3003}
3004
3005void MacroAssembler::incrementl(Register reg, int value) {
3006 if (value == min_jint) {addl(reg, value) ; return; }
3007 if (value < 0) { decrementl(reg, -value); return; }
3008 if (value == 0) { ; return; }
3009 if (value == 1 && UseIncDec) { incl(reg) ; return; }
3010 /* else */ { addl(reg, value) ; return; }
3011}
3012
3013void MacroAssembler::incrementl(Address dst, int value) {
3014 if (value == min_jint) {addl(dst, value) ; return; }
3015 if (value < 0) { decrementl(dst, -value); return; }
3016 if (value == 0) { ; return; }
3017 if (value == 1 && UseIncDec) { incl(dst) ; return; }
3018 /* else */ { addl(dst, value) ; return; }
3019}
3020
3021void MacroAssembler::jump(AddressLiteral dst) {
3022 if (reachable(dst)) {
3023 jmp_literal(dst.target(), dst.rspec());
3024 } else {
3025 lea(rscratch1, dst);
3026 jmp(rscratch1);
3027 }
3028}
3029
3030void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3031 if (reachable(dst)) {
3032 InstructionMark im(this);
3033 relocate(dst.reloc());
3034 const int short_size = 2;
3035 const int long_size = 6;
3036 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3037 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3038 // 0111 tttn #8-bit disp
3039 emit_int8(0x70 | cc);
3040 emit_int8((offs - short_size) & 0xFF);
3041 } else {
3042 // 0000 1111 1000 tttn #32-bit disp
3043 emit_int8(0x0F);
3044 emit_int8((unsigned char)(0x80 | cc));
3045 emit_int32(offs - long_size);
3046 }
3047 } else {
3048#ifdef ASSERT
3049 warning("reversing conditional branch");
3050#endif /* ASSERT */
3051 Label skip;
3052 jccb(reverse[cc], skip);
3053 lea(rscratch1, dst);
3054 Assembler::jmp(rscratch1);
3055 bind(skip);
3056 }
3057}
3058
3059void MacroAssembler::ldmxcsr(AddressLiteral src) {
3060 if (reachable(src)) {
3061 Assembler::ldmxcsr(as_Address(src));
3062 } else {
3063 lea(rscratch1, src);
3064 Assembler::ldmxcsr(Address(rscratch1, 0));
3065 }
3066}
3067
3068int MacroAssembler::load_signed_byte(Register dst, Address src) {
3069 int off;
3070 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3071 off = offset();
3072 movsbl(dst, src); // movsxb
3073 } else {
3074 off = load_unsigned_byte(dst, src);
3075 shll(dst, 24);
3076 sarl(dst, 24);
3077 }
3078 return off;
3079}
3080
3081// Note: load_signed_short used to be called load_signed_word.
3082// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3083// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3084// The term "word" in HotSpot means a 32- or 64-bit machine word.
3085int MacroAssembler::load_signed_short(Register dst, Address src) {
3086 int off;
3087 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3088 // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3089 // version but this is what 64bit has always done. This seems to imply
3090 // that users are only using 32bits worth.
3091 off = offset();
3092 movswl(dst, src); // movsxw
3093 } else {
3094 off = load_unsigned_short(dst, src);
3095 shll(dst, 16);
3096 sarl(dst, 16);
3097 }
3098 return off;
3099}
3100
3101int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22).
3104 int off;
3105 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3106 off = offset();
3107 movzbl(dst, src); // movzxb
3108 } else {
3109 xorl(dst, dst);
3110 off = offset();
3111 movb(dst, src);
3112 }
3113 return off;
3114}
3115
3116// Note: load_unsigned_short used to be called load_unsigned_word.
3117int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22).
3120 int off;
3121 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3122 off = offset();
3123 movzwl(dst, src); // movzxw
3124 } else {
3125 xorl(dst, dst);
3126 off = offset();
3127 movw(dst, src);
3128 }
3129 return off;
3130}
3131
3132void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3133 switch (size_in_bytes) {
3134#ifndef _LP64
3135 case 8:
3136 assert(dst2 != noreg, "second dest register required");
3137 movl(dst, src);
3138 movl(dst2, src.plus_disp(BytesPerInt));
3139 break;
3140#else
3141 case 8: movq(dst, src); break;
3142#endif
3143 case 4: movl(dst, src); break;
3144 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3145 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3146 default: ShouldNotReachHere();
3147 }
3148}
3149
3150void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3151 switch (size_in_bytes) {
3152#ifndef _LP64
3153 case 8:
3154 assert(src2 != noreg, "second source register required");
3155 movl(dst, src);
3156 movl(dst.plus_disp(BytesPerInt), src2);
3157 break;
3158#else
3159 case 8: movq(dst, src); break;
3160#endif
3161 case 4: movl(dst, src); break;
3162 case 2: movw(dst, src); break;
3163 case 1: movb(dst, src); break;
3164 default: ShouldNotReachHere();
3165 }
3166}
3167
3168void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3169 if (reachable(dst)) {
3170 movl(as_Address(dst), src);
3171 } else {
3172 lea(rscratch1, dst);
3173 movl(Address(rscratch1, 0), src);
3174 }
3175}
3176
3177void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3178 if (reachable(src)) {
3179 movl(dst, as_Address(src));
3180 } else {
3181 lea(rscratch1, src);
3182 movl(dst, Address(rscratch1, 0));
3183 }
3184}
3185
3186// C++ bool manipulation
3187
3188void MacroAssembler::movbool(Register dst, Address src) {
3189 if(sizeof(bool) == 1)
3190 movb(dst, src);
3191 else if(sizeof(bool) == 2)
3192 movw(dst, src);
3193 else if(sizeof(bool) == 4)
3194 movl(dst, src);
3195 else
3196 // unsupported
3197 ShouldNotReachHere();
3198}
3199
3200void MacroAssembler::movbool(Address dst, bool boolconst) {
3201 if(sizeof(bool) == 1)
3202 movb(dst, (int) boolconst);
3203 else if(sizeof(bool) == 2)
3204 movw(dst, (int) boolconst);
3205 else if(sizeof(bool) == 4)
3206 movl(dst, (int) boolconst);
3207 else
3208 // unsupported
3209 ShouldNotReachHere();
3210}
3211
3212void MacroAssembler::movbool(Address dst, Register src) {
3213 if(sizeof(bool) == 1)
3214 movb(dst, src);
3215 else if(sizeof(bool) == 2)
3216 movw(dst, src);
3217 else if(sizeof(bool) == 4)
3218 movl(dst, src);
3219 else
3220 // unsupported
3221 ShouldNotReachHere();
3222}
3223
3224void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3225 movb(as_Address(dst), src);
3226}
3227
3228void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3229 if (reachable(src)) {
3230 movdl(dst, as_Address(src));
3231 } else {
3232 lea(rscratch1, src);
3233 movdl(dst, Address(rscratch1, 0));
3234 }
3235}
3236
3237void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3238 if (reachable(src)) {
3239 movq(dst, as_Address(src));
3240 } else {
3241 lea(rscratch1, src);
3242 movq(dst, Address(rscratch1, 0));
3243 }
3244}
3245
3246#ifdef COMPILER2
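// setvectmask loads k1 with a mask of the low 'src' bits, i.e. (1 << src) - 1,
// for use by post-loop multiversioning when handling the vector remainder;
// dst ends up holding a copy of src.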
3247void MacroAssembler::setvectmask(Register dst, Register src) {
3248 guarantee(PostLoopMultiversioning, "must be");
3249 Assembler::movl(dst, 1);
3250 Assembler::shlxl(dst, dst, src);
3251 Assembler::decl(dst);
3252 Assembler::kmovdl(k1, dst);
3253 Assembler::movl(dst, src);
3254}
3255
3256void MacroAssembler::restorevectmask() {
3257 guarantee(PostLoopMultiversioning, "must be");
3258 Assembler::knotwl(k1, k0);
3259}
3260#endif // COMPILER2
3261
3262void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3263 if (reachable(src)) {
3264 if (UseXmmLoadAndClearUpper) {
3265 movsd (dst, as_Address(src));
3266 } else {
3267 movlpd(dst, as_Address(src));
3268 }
3269 } else {
3270 lea(rscratch1, src);
3271 if (UseXmmLoadAndClearUpper) {
3272 movsd (dst, Address(rscratch1, 0));
3273 } else {
3274 movlpd(dst, Address(rscratch1, 0));
3275 }
3276 }
3277}
3278
3279void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3280 if (reachable(src)) {
3281 movss(dst, as_Address(src));
3282 } else {
3283 lea(rscratch1, src);
3284 movss(dst, Address(rscratch1, 0));
3285 }
3286}
3287
3288void MacroAssembler::movptr(Register dst, Register src) {
3289 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3290}
3291
3292void MacroAssembler::movptr(Register dst, Address src) {
3293 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3294}
3295
3296// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3297void MacroAssembler::movptr(Register dst, intptr_t src) {
3298 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3299}
3300
3301void MacroAssembler::movptr(Address dst, Register src) {
3302 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3303}
3304
3305void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3306 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3307 Assembler::movdqu(dst, src);
3308}
3309
3310void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3311 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3312 Assembler::movdqu(dst, src);
3313}
3314
3315void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3316 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3317 Assembler::movdqu(dst, src);
3318}
3319
3320void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3321 if (reachable(src)) {
3322 movdqu(dst, as_Address(src));
3323 } else {
3324 lea(scratchReg, src);
3325 movdqu(dst, Address(scratchReg, 0));
3326 }
3327}
3328
3329void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3330 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3331 Assembler::vmovdqu(dst, src);
3332}
3333
3334void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3335 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3336 Assembler::vmovdqu(dst, src);
3337}
3338
3339void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3340 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3341 Assembler::vmovdqu(dst, src);
3342}
3343
3344void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3345 if (reachable(src)) {
3346 vmovdqu(dst, as_Address(src));
  } else {
3349 lea(scratch_reg, src);
3350 vmovdqu(dst, Address(scratch_reg, 0));
3351 }
3352}
3353
3354void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3355 if (reachable(src)) {
3356 Assembler::evmovdquq(dst, as_Address(src), vector_len);
3357 } else {
3358 lea(rscratch, src);
3359 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3360 }
3361}
3362
3363void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3364 if (reachable(src)) {
3365 Assembler::movdqa(dst, as_Address(src));
3366 } else {
3367 lea(rscratch1, src);
3368 Assembler::movdqa(dst, Address(rscratch1, 0));
3369 }
3370}
3371
3372void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3373 if (reachable(src)) {
3374 Assembler::movsd(dst, as_Address(src));
3375 } else {
3376 lea(rscratch1, src);
3377 Assembler::movsd(dst, Address(rscratch1, 0));
3378 }
3379}
3380
3381void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3382 if (reachable(src)) {
3383 Assembler::movss(dst, as_Address(src));
3384 } else {
3385 lea(rscratch1, src);
3386 Assembler::movss(dst, Address(rscratch1, 0));
3387 }
3388}
3389
3390void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3391 if (reachable(src)) {
3392 Assembler::mulsd(dst, as_Address(src));
3393 } else {
3394 lea(rscratch1, src);
3395 Assembler::mulsd(dst, Address(rscratch1, 0));
3396 }
3397}
3398
3399void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3400 if (reachable(src)) {
3401 Assembler::mulss(dst, as_Address(src));
3402 } else {
3403 lea(rscratch1, src);
3404 Assembler::mulss(dst, Address(rscratch1, 0));
3405 }
3406}
3407
3408void MacroAssembler::null_check(Register reg, int offset) {
3409 if (needs_explicit_null_check(offset)) {
    // provoke an OS NULL exception if reg == NULL by
    // accessing M[reg] without changing any (non-CC) registers
    // NOTE: a compare (the cmpptr below) is plenty to provoke a segv
3413 cmpptr(rax, Address(reg, 0));
    // Note: testl(rax, Address(reg, 0)) would probably be shorter code,
    //       but that form of testl has not been implemented yet
3417 } else {
3418 // nothing to do, (later) access of M[reg + offset]
3419 // will provoke OS NULL exception if reg = NULL
3420 }
3421}
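
// Illustrative (hypothetical call sites, not from this file): a small offset
// lets the subsequent access of M[reg + offset] serve as the implicit check,
// while an offset that needs_explicit_null_check() rejects forces the
// explicit cmpptr above:
//
//   null_check(rcx);                       // implicit: later access faults if rcx == NULL
//   null_check(rcx, very_large_offset);    // explicit: cmpptr emitted here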
3422
3423void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3425 // (e.g., MSVC can't call ps() otherwise)
3426 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3427}
3428
3429void MacroAssembler::unimplemented(const char* what) {
3430 const char* buf = NULL;
3431 {
3432 ResourceMark rm;
3433 stringStream ss;
3434 ss.print("unimplemented: %s", what);
3435 buf = code_string(ss.as_string());
3436 }
3437 stop(buf);
3438}
3439
3440#ifdef _LP64
3441#define XSTATE_BV 0x200
3442#endif
3443
3444void MacroAssembler::pop_CPU_state() {
3445 pop_FPU_state();
3446 pop_IU_state();
3447}
3448
3449void MacroAssembler::pop_FPU_state() {
3450#ifndef _LP64
3451 frstor(Address(rsp, 0));
3452#else
3453 fxrstor(Address(rsp, 0));
3454#endif
3455 addptr(rsp, FPUStateSizeInWords * wordSize);
3456}
3457
3458void MacroAssembler::pop_IU_state() {
3459 popa();
3460 LP64_ONLY(addq(rsp, 8));
3461 popf();
3462}
3463
// Save Integer and Float state
// Warning: Stack must be 16-byte aligned (64-bit)
3466void MacroAssembler::push_CPU_state() {
3467 push_IU_state();
3468 push_FPU_state();
3469}
3470
3471void MacroAssembler::push_FPU_state() {
3472 subptr(rsp, FPUStateSizeInWords * wordSize);
3473#ifndef _LP64
3474 fnsave(Address(rsp, 0));
3475 fwait();
3476#else
3477 fxsave(Address(rsp, 0));
3478#endif // LP64
3479}
3480
3481void MacroAssembler::push_IU_state() {
3482 // Push flags first because pusha kills them
3483 pushf();
3484 // Make sure rsp stays 16-byte aligned
3485 LP64_ONLY(subq(rsp, 8));
3486 pusha();
3487}
3488
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
3491 java_thread = rdi;
3492 get_thread(java_thread);
3493 }
3494 // we must set sp to zero to clear frame
3495 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3496 if (clear_fp) {
3497 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3498 }
3499
3500 // Always clear the pc because it could have been set by make_walkable()
3501 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3502
3503 vzeroupper();
3504}
3505
3506void MacroAssembler::restore_rax(Register tmp) {
3507 if (tmp == noreg) pop(rax);
3508 else if (tmp != rax) mov(rax, tmp);
3509}
3510
3511void MacroAssembler::round_to(Register reg, int modulus) {
3512 addptr(reg, modulus - 1);
3513 andptr(reg, -modulus);
3514}
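
// Worked example (illustrative): round_to(reg, 8) with reg == 13 computes
// 13 + 7 == 20, then the andptr with -8 clears the low three bits, giving 16,
// i.e. the value rounded up to the next multiple of 8; an already-aligned
// value such as 16 passes through unchanged.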
3515
3516void MacroAssembler::save_rax(Register tmp) {
3517 if (tmp == noreg) push(rax);
3518 else if (tmp != rax) mov(tmp, rax);
3519}
3520
3521void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3522 if (SafepointMechanism::uses_thread_local_poll()) {
3523#ifdef _LP64
3524 assert(thread_reg == r15_thread, "should be");
3525#else
3526 if (thread_reg == noreg) {
3527 thread_reg = temp_reg;
3528 get_thread(thread_reg);
3529 }
3530#endif
3531 testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3532 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3533 } else {
3534 cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3535 SafepointSynchronize::_not_synchronized);
3536 jcc(Assembler::notEqual, slow_path);
3537 }
3538}
3539
3540// Calls to C land
3541//
// When entering C land, the rbp and rsp of the last Java frame have to be recorded
3543// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3544// has to be reset to 0. This is required to allow proper stack traversal.
3545void MacroAssembler::set_last_Java_frame(Register java_thread,
3546 Register last_java_sp,
3547 Register last_java_fp,
3548 address last_java_pc) {
3549 vzeroupper();
3550 // determine java_thread register
3551 if (!java_thread->is_valid()) {
3552 java_thread = rdi;
3553 get_thread(java_thread);
3554 }
3555 // determine last_java_sp register
3556 if (!last_java_sp->is_valid()) {
3557 last_java_sp = rsp;
3558 }
3559
3560 // last_java_fp is optional
3561
3562 if (last_java_fp->is_valid()) {
3563 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3564 }
3565
3566 // last_java_pc is optional
3567
3568 if (last_java_pc != NULL) {
3569 lea(Address(java_thread,
3570 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3571 InternalAddress(last_java_pc));
3572
3573 }
3574 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3575}
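
// Illustrative pattern for a transition into C land (a sketch only; the real
// sequences live in call_VM_base and the runtime stubs and differ in detail):
//
//   set_last_Java_frame(thread, noreg, noreg, the_pc);  // record the anchor
//   call(RuntimeAddress(entry_point));                  // run native/VM code
//   reset_last_Java_frame(thread, true);                // clear sp, fp and pc
//
// The recorded anchor is what lets the stack walker find the last Java frame
// while the thread is executing in C land.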
3576
3577void MacroAssembler::shlptr(Register dst, int imm8) {
3578 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3579}
3580
3581void MacroAssembler::shrptr(Register dst, int imm8) {
3582 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3583}
3584
3585void MacroAssembler::sign_extend_byte(Register reg) {
3586 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3587 movsbl(reg, reg); // movsxb
3588 } else {
3589 shll(reg, 24);
3590 sarl(reg, 24);
3591 }
3592}
3593
3594void MacroAssembler::sign_extend_short(Register reg) {
3595 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3596 movswl(reg, reg); // movsxw
3597 } else {
3598 shll(reg, 16);
3599 sarl(reg, 16);
3600 }
3601}
3602
3603void MacroAssembler::testl(Register dst, AddressLiteral src) {
3604 assert(reachable(src), "Address should be reachable");
3605 testl(dst, as_Address(src));
3606}
3607
3608void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3609 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3610 Assembler::pcmpeqb(dst, src);
3611}
3612
3613void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3614 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3615 Assembler::pcmpeqw(dst, src);
3616}
3617
3618void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3619 assert((dst->encoding() < 16),"XMM register should be 0-15");
3620 Assembler::pcmpestri(dst, src, imm8);
3621}
3622
3623void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3624 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3625 Assembler::pcmpestri(dst, src, imm8);
3626}
3627
3628void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3629 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3630 Assembler::pmovzxbw(dst, src);
3631}
3632
3633void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3634 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3635 Assembler::pmovzxbw(dst, src);
3636}
3637
3638void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3639 assert((src->encoding() < 16),"XMM register should be 0-15");
3640 Assembler::pmovmskb(dst, src);
3641}
3642
3643void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3644 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3645 Assembler::ptest(dst, src);
3646}
3647
3648void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3649 if (reachable(src)) {
3650 Assembler::sqrtsd(dst, as_Address(src));
3651 } else {
3652 lea(rscratch1, src);
3653 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3654 }
3655}
3656
3657void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3658 if (reachable(src)) {
3659 Assembler::sqrtss(dst, as_Address(src));
3660 } else {
3661 lea(rscratch1, src);
3662 Assembler::sqrtss(dst, Address(rscratch1, 0));
3663 }
3664}
3665
3666void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3667 if (reachable(src)) {
3668 Assembler::subsd(dst, as_Address(src));
3669 } else {
3670 lea(rscratch1, src);
3671 Assembler::subsd(dst, Address(rscratch1, 0));
3672 }
3673}
3674
3675void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3676 if (reachable(src)) {
3677 Assembler::subss(dst, as_Address(src));
3678 } else {
3679 lea(rscratch1, src);
3680 Assembler::subss(dst, Address(rscratch1, 0));
3681 }
3682}
3683
3684void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3685 if (reachable(src)) {
3686 Assembler::ucomisd(dst, as_Address(src));
3687 } else {
3688 lea(rscratch1, src);
3689 Assembler::ucomisd(dst, Address(rscratch1, 0));
3690 }
3691}
3692
3693void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3694 if (reachable(src)) {
3695 Assembler::ucomiss(dst, as_Address(src));
3696 } else {
3697 lea(rscratch1, src);
3698 Assembler::ucomiss(dst, Address(rscratch1, 0));
3699 }
3700}
3701
3702void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3703 // Used in sign-bit flipping with aligned address.
3704 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3705 if (reachable(src)) {
3706 Assembler::xorpd(dst, as_Address(src));
3707 } else {
3708 lea(scratch_reg, src);
3709 Assembler::xorpd(dst, Address(scratch_reg, 0));
3710 }
3711}
3712
3713void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3714 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3715 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
3718 Assembler::xorpd(dst, src);
3719 }
3720}
3721
3722void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3723 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3724 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3725 } else {
3726 Assembler::xorps(dst, src);
3727 }
3728}
3729
3730void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3731 // Used in sign-bit flipping with aligned address.
3732 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3733 if (reachable(src)) {
3734 Assembler::xorps(dst, as_Address(src));
3735 } else {
3736 lea(scratch_reg, src);
3737 Assembler::xorps(dst, Address(scratch_reg, 0));
3738 }
3739}
3740
3741void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3742 // Used in sign-bit flipping with aligned address.
3743 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3744 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3745 if (reachable(src)) {
3746 Assembler::pshufb(dst, as_Address(src));
3747 } else {
3748 lea(rscratch1, src);
3749 Assembler::pshufb(dst, Address(rscratch1, 0));
3750 }
3751}
3752
3753// AVX 3-operands instructions
3754
3755void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3756 if (reachable(src)) {
3757 vaddsd(dst, nds, as_Address(src));
3758 } else {
3759 lea(rscratch1, src);
3760 vaddsd(dst, nds, Address(rscratch1, 0));
3761 }
3762}
3763
3764void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3765 if (reachable(src)) {
3766 vaddss(dst, nds, as_Address(src));
3767 } else {
3768 lea(rscratch1, src);
3769 vaddss(dst, nds, Address(rscratch1, 0));
3770 }
3771}
3772
3773void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3774 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3775 vandps(dst, nds, negate_field, vector_len);
3776}
3777
3778void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3779 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3780 vandpd(dst, nds, negate_field, vector_len);
3781}
3782
3783void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3784 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3785 Assembler::vpaddb(dst, nds, src, vector_len);
3786}
3787
3788void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3789 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3790 Assembler::vpaddb(dst, nds, src, vector_len);
3791}
3792
3793void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3794 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3795 Assembler::vpaddw(dst, nds, src, vector_len);
3796}
3797
3798void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3799 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3800 Assembler::vpaddw(dst, nds, src, vector_len);
3801}
3802
3803void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3804 if (reachable(src)) {
3805 Assembler::vpand(dst, nds, as_Address(src), vector_len);
3806 } else {
3807 lea(scratch_reg, src);
3808 Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3809 }
3810}
3811
3812void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3813 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3814 Assembler::vpbroadcastw(dst, src, vector_len);
3815}
3816
3817void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3818 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3819 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3820}
3821
3822void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3823 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3824 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3825}
3826
3827void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3828 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3829 Assembler::vpmovzxbw(dst, src, vector_len);
3830}
3831
3832void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3833 assert((src->encoding() < 16),"XMM register should be 0-15");
3834 Assembler::vpmovmskb(dst, src);
3835}
3836
3837void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3838 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3839 Assembler::vpmullw(dst, nds, src, vector_len);
3840}
3841
3842void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3843 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3844 Assembler::vpmullw(dst, nds, src, vector_len);
3845}
3846
3847void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3848 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3849 Assembler::vpsubb(dst, nds, src, vector_len);
3850}
3851
3852void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3853 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3854 Assembler::vpsubb(dst, nds, src, vector_len);
3855}
3856
3857void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3858 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3859 Assembler::vpsubw(dst, nds, src, vector_len);
3860}
3861
3862void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3863 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3864 Assembler::vpsubw(dst, nds, src, vector_len);
3865}
3866
3867void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3868 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3869 Assembler::vpsraw(dst, nds, shift, vector_len);
3870}
3871
3872void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3873 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3874 Assembler::vpsraw(dst, nds, shift, vector_len);
3875}
3876
3877void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3878 assert(UseAVX > 2,"");
3879 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3880 vector_len = 2;
3881 }
3882 Assembler::evpsraq(dst, nds, shift, vector_len);
3883}
3884
3885void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3886 assert(UseAVX > 2,"");
3887 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3888 vector_len = 2;
3889 }
3890 Assembler::evpsraq(dst, nds, shift, vector_len);
3891}
3892
3893void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3894 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3895 Assembler::vpsrlw(dst, nds, shift, vector_len);
3896}
3897
3898void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3899 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3900 Assembler::vpsrlw(dst, nds, shift, vector_len);
3901}
3902
3903void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3904 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3905 Assembler::vpsllw(dst, nds, shift, vector_len);
3906}
3907
3908void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3909 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3910 Assembler::vpsllw(dst, nds, shift, vector_len);
3911}
3912
3913void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3914 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3915 Assembler::vptest(dst, src);
3916}
3917
3918void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3919 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3920 Assembler::punpcklbw(dst, src);
3921}
3922
3923void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3924 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3925 Assembler::pshufd(dst, src, mode);
3926}
3927
3928void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3929 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3930 Assembler::pshuflw(dst, src, mode);
3931}
3932
3933void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3934 if (reachable(src)) {
3935 vandpd(dst, nds, as_Address(src), vector_len);
3936 } else {
3937 lea(scratch_reg, src);
3938 vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3939 }
3940}
3941
3942void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3943 if (reachable(src)) {
3944 vandps(dst, nds, as_Address(src), vector_len);
3945 } else {
3946 lea(scratch_reg, src);
3947 vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3948 }
3949}
3950
3951void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3952 if (reachable(src)) {
3953 vdivsd(dst, nds, as_Address(src));
3954 } else {
3955 lea(rscratch1, src);
3956 vdivsd(dst, nds, Address(rscratch1, 0));
3957 }
3958}
3959
3960void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3961 if (reachable(src)) {
3962 vdivss(dst, nds, as_Address(src));
3963 } else {
3964 lea(rscratch1, src);
3965 vdivss(dst, nds, Address(rscratch1, 0));
3966 }
3967}
3968
3969void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3970 if (reachable(src)) {
3971 vmulsd(dst, nds, as_Address(src));
3972 } else {
3973 lea(rscratch1, src);
3974 vmulsd(dst, nds, Address(rscratch1, 0));
3975 }
3976}
3977
3978void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3979 if (reachable(src)) {
3980 vmulss(dst, nds, as_Address(src));
3981 } else {
3982 lea(rscratch1, src);
3983 vmulss(dst, nds, Address(rscratch1, 0));
3984 }
3985}
3986
3987void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3988 if (reachable(src)) {
3989 vsubsd(dst, nds, as_Address(src));
3990 } else {
3991 lea(rscratch1, src);
3992 vsubsd(dst, nds, Address(rscratch1, 0));
3993 }
3994}
3995
3996void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3997 if (reachable(src)) {
3998 vsubss(dst, nds, as_Address(src));
3999 } else {
4000 lea(rscratch1, src);
4001 vsubss(dst, nds, Address(rscratch1, 0));
4002 }
4003}
4004
4005void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4006 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4007 vxorps(dst, nds, src, Assembler::AVX_128bit);
4008}
4009
4010void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4011 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4012 vxorpd(dst, nds, src, Assembler::AVX_128bit);
4013}
4014
4015void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4016 if (reachable(src)) {
4017 vxorpd(dst, nds, as_Address(src), vector_len);
4018 } else {
4019 lea(scratch_reg, src);
4020 vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4021 }
4022}
4023
4024void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4025 if (reachable(src)) {
4026 vxorps(dst, nds, as_Address(src), vector_len);
4027 } else {
4028 lea(scratch_reg, src);
4029 vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4030 }
4031}
4032
4033void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4034 if (UseAVX > 1 || (vector_len < 1)) {
4035 if (reachable(src)) {
4036 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4037 } else {
4038 lea(scratch_reg, src);
4039 Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4040 }
  } else {
4043 MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4044 }
4045}
4046
4047//-------------------------------------------------------------------------------------------
4048#ifdef COMPILER2
4049// Generic instructions support for use in .ad files C2 code generation
4050
4051void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4052 if (opcode == Op_AbsVD) {
4053 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4054 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
4056 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4057 }
4058}
4059
4060void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4061 if (opcode == Op_AbsVD) {
4062 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4063 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
4065 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4066 }
4067}
4068
4069void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4070 if (opcode == Op_AbsVF) {
4071 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4072 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
4074 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4075 }
4076}
4077
4078void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4079 if (opcode == Op_AbsVF) {
4080 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4081 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
4083 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4084 }
4085}
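
// Background for the masks used above (the usual IEEE-754 sign-bit trick, not
// something defined in this file): absolute value only needs the sign bit
// cleared and negation only needs it flipped, so the per-lane "sign mask"
// constants are expected to be 0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF and the
// "sign flip" constants 0x80000000 / 0x8000000000000000; a single andp*/xorp*
// against them implements AbsVF/AbsVD/NegVF/NegVD without branches.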
4086
4087void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4088 if (sign) {
4089 pmovsxbw(dst, src);
4090 } else {
4091 pmovzxbw(dst, src);
4092 }
4093}
4094
4095void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4096 if (sign) {
4097 vpmovsxbw(dst, src, vector_len);
4098 } else {
4099 vpmovzxbw(dst, src, vector_len);
4100 }
4101}
4102
4103void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4104 if (opcode == Op_RShiftVI) {
4105 psrad(dst, src);
4106 } else if (opcode == Op_LShiftVI) {
4107 pslld(dst, src);
4108 } else {
4109 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4110 psrld(dst, src);
4111 }
4112}
4113
4114void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4115 if (opcode == Op_RShiftVI) {
4116 vpsrad(dst, nds, src, vector_len);
4117 } else if (opcode == Op_LShiftVI) {
4118 vpslld(dst, nds, src, vector_len);
4119 } else {
4120 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4121 vpsrld(dst, nds, src, vector_len);
4122 }
4123}
4124
4125void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4126 if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4127 psraw(dst, src);
4128 } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4129 psllw(dst, src);
4130 } else {
4131 assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4132 psrlw(dst, src);
4133 }
4134}
4135
4136void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4137 if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4138 vpsraw(dst, nds, src, vector_len);
4139 } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4140 vpsllw(dst, nds, src, vector_len);
4141 } else {
4142 assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4143 vpsrlw(dst, nds, src, vector_len);
4144 }
4145}
4146
4147void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4148 if (opcode == Op_RShiftVL) {
    psrlq(dst, src); // using srl to implement sra on pre-AVX512 systems
4150 } else if (opcode == Op_LShiftVL) {
4151 psllq(dst, src);
4152 } else {
4153 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4154 psrlq(dst, src);
4155 }
4156}
4157
4158void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4159 if (opcode == Op_RShiftVL) {
4160 evpsraq(dst, nds, src, vector_len);
4161 } else if (opcode == Op_LShiftVL) {
4162 vpsllq(dst, nds, src, vector_len);
4163 } else {
4164 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4165 vpsrlq(dst, nds, src, vector_len);
4166 }
4167}
4168#endif
4169//-------------------------------------------------------------------------------------------
4170
4171void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4172 const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4173 STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4174 // The inverted mask is sign-extended
4175 andptr(possibly_jweak, inverted_jweak_mask);
4176}
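
// Illustrative example (the values follow from the STATIC_ASSERT above, which
// pins weak_tag_mask to 1): a weak handle such as the hypothetical value
// 0x00007f00cafe0005 has its low bit set; andptr with -2 (~1) turns it into
// the untagged address 0x00007f00cafe0004, while a normal (untagged) jobject
// passes through unchanged.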
4177
4178void MacroAssembler::resolve_jobject(Register value,
4179 Register thread,
4180 Register tmp) {
4181 assert_different_registers(value, thread, tmp);
4182 Label done, not_weak;
4183 testptr(value, value);
4184 jcc(Assembler::zero, done); // Use NULL as-is.
4185 testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4186 jcc(Assembler::zero, not_weak);
4187 // Resolve jweak.
4188 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4189 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4190 verify_oop(value);
4191 jmp(done);
4192 bind(not_weak);
4193 // Resolve (untagged) jobject.
4194 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4195 verify_oop(value);
4196 bind(done);
4197}
4198
4199void MacroAssembler::subptr(Register dst, int32_t imm32) {
4200 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4201}
4202
4203// Force generation of a 4 byte immediate value even if it fits into 8bit
// Force generation of a 4-byte immediate value even if it fits into 8 bits
4205 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4206}
4207
4208void MacroAssembler::subptr(Register dst, Register src) {
4209 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4210}
4211
4212// C++ bool manipulation
4213void MacroAssembler::testbool(Register dst) {
4214 if(sizeof(bool) == 1)
4215 testb(dst, 0xff);
4216 else if(sizeof(bool) == 2) {
4217 // testw implementation needed for two byte bools
4218 ShouldNotReachHere();
4219 } else if(sizeof(bool) == 4)
4220 testl(dst, dst);
4221 else
4222 // unsupported
4223 ShouldNotReachHere();
4224}
4225
4226void MacroAssembler::testptr(Register dst, Register src) {
4227 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4228}
4229
4230// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4231void MacroAssembler::tlab_allocate(Register thread, Register obj,
4232 Register var_size_in_bytes,
4233 int con_size_in_bytes,
4234 Register t1,
4235 Register t2,
4236 Label& slow_case) {
4237 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4238 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4239}
4240
4241// Defines obj, preserves var_size_in_bytes
4242void MacroAssembler::eden_allocate(Register thread, Register obj,
4243 Register var_size_in_bytes,
4244 int con_size_in_bytes,
4245 Register t1,
4246 Label& slow_case) {
4247 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4248 bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4249}
4250
// Preserves the contents of address, destroys the contents of length_in_bytes and temp.
4252void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4253 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4254 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4255 Label done;
4256
4257 testptr(length_in_bytes, length_in_bytes);
4258 jcc(Assembler::zero, done);
4259
4260 // initialize topmost word, divide index by 2, check if odd and test if zero
4261 // note: for the remaining code to work, index must be a multiple of BytesPerWord
4262#ifdef ASSERT
4263 {
4264 Label L;
4265 testptr(length_in_bytes, BytesPerWord - 1);
4266 jcc(Assembler::zero, L);
4267 stop("length must be a multiple of BytesPerWord");
4268 bind(L);
4269 }
4270#endif
4271 Register index = length_in_bytes;
  xorptr(temp, temp);    // use a zeroed register to clear memory (shorter code)
4273 if (UseIncDec) {
4274 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
4275 } else {
4276 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
4277 shrptr(index, 1);
4278 }
4279#ifndef _LP64
  // index might not have been a multiple of 8 (i.e., bit 2 was set)
4281 {
4282 Label even;
4283 // note: if index was a multiple of 8, then it cannot
4284 // be 0 now otherwise it must have been 0 before
4285 // => if it is even, we don't need to check for 0 again
4286 jcc(Assembler::carryClear, even);
4287 // clear topmost word (no jump would be needed if conditional assignment worked here)
4288 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4289 // index could be 0 now, must check again
4290 jcc(Assembler::zero, done);
4291 bind(even);
4292 }
4293#endif // !_LP64
4294 // initialize remaining object fields: index is a multiple of 2 now
4295 {
4296 Label loop;
4297 bind(loop);
4298 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4299 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4300 decrement(index);
4301 jcc(Assembler::notZero, loop);
4302 }
4303
4304 bind(done);
4305}
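
// Roughly equivalent C for the loop above (illustrative sketch, 64-bit shape;
// the 32-bit path additionally clears an odd leading word and then stores two
// 32-bit words per iteration):
//
//   intptr_t* base = (intptr_t*)(address + offset_in_bytes);
//   for (size_t i = length_in_bytes / BytesPerWord; i > 0; i--) {
//     base[i - 1] = 0;   // fill from the top of the range downwards
//   }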
4306
4307// Look up the method for a megamorphic invokeinterface call.
4308// The target method is determined by <intf_klass, itable_index>.
4309// The receiver klass is in recv_klass.
4310// On success, the result will be in method_result, and execution falls through.
4311// On failure, execution transfers to the given label.
4312void MacroAssembler::lookup_interface_method(Register recv_klass,
4313 Register intf_klass,
4314 RegisterOrConstant itable_index,
4315 Register method_result,
4316 Register scan_temp,
4317 Label& L_no_such_interface,
4318 bool return_method) {
4319 assert_different_registers(recv_klass, intf_klass, scan_temp);
4320 assert_different_registers(method_result, intf_klass, scan_temp);
4321 assert(recv_klass != method_result || !return_method,
4322 "recv_klass can be destroyed when method isn't needed");
4323
4324 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4325 "caller must use same register for non-constant itable index as for method");
4326
4327 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4328 int vtable_base = in_bytes(Klass::vtable_start_offset());
4329 int itentry_off = itableMethodEntry::method_offset_in_bytes();
4330 int scan_step = itableOffsetEntry::size() * wordSize;
4331 int vte_size = vtableEntry::size_in_bytes();
4332 Address::ScaleFactor times_vte_scale = Address::times_ptr;
4333 assert(vte_size == wordSize, "else adjust times_vte_scale");
4334
4335 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4336
4337 // %%% Could store the aligned, prescaled offset in the klassoop.
4338 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4339
4340 if (return_method) {
4341 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4342 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4343 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4344 }
4345
4346 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4347 // if (scan->interface() == intf) {
4348 // result = (klass + scan->offset() + itable_index);
4349 // }
4350 // }
4351 Label search, found_method;
4352
4353 for (int peel = 1; peel >= 0; peel--) {
4354 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4355 cmpptr(intf_klass, method_result);
4356
4357 if (peel) {
4358 jccb(Assembler::equal, found_method);
4359 } else {
4360 jccb(Assembler::notEqual, search);
4361 // (invert the test to fall through to found_method...)
4362 }
4363
4364 if (!peel) break;
4365
4366 bind(search);
4367
4368 // Check that the previous entry is non-null. A null entry means that
4369 // the receiver class doesn't implement the interface, and wasn't the
4370 // same as when the caller was compiled.
4371 testptr(method_result, method_result);
4372 jcc(Assembler::zero, L_no_such_interface);
4373 addptr(scan_temp, scan_step);
4374 }
4375
4376 bind(found_method);
4377
4378 if (return_method) {
4379 // Got a hit.
4380 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4381 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4382 }
4383}
4384
4385
4386// virtual method calling
4387void MacroAssembler::lookup_virtual_method(Register recv_klass,
4388 RegisterOrConstant vtable_index,
4389 Register method_result) {
4390 const int base = in_bytes(Klass::vtable_start_offset());
4391 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4392 Address vtable_entry_addr(recv_klass,
4393 vtable_index, Address::times_ptr,
4394 base + vtableEntry::method_offset_in_bytes());
4395 movptr(method_result, vtable_entry_addr);
4396}
4397
4398
4399void MacroAssembler::check_klass_subtype(Register sub_klass,
4400 Register super_klass,
4401 Register temp_reg,
4402 Label& L_success) {
4403 Label L_failure;
4404 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
4405 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4406 bind(L_failure);
4407}
4408
4409
4410void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4411 Register super_klass,
4412 Register temp_reg,
4413 Label* L_success,
4414 Label* L_failure,
4415 Label* L_slow_path,
4416 RegisterOrConstant super_check_offset) {
4417 assert_different_registers(sub_klass, super_klass, temp_reg);
4418 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4419 if (super_check_offset.is_register()) {
4420 assert_different_registers(sub_klass, super_klass,
4421 super_check_offset.as_register());
4422 } else if (must_load_sco) {
4423 assert(temp_reg != noreg, "supply either a temp or a register offset");
4424 }
4425
4426 Label L_fallthrough;
4427 int label_nulls = 0;
4428 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
4429 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
4430 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4431 assert(label_nulls <= 1, "at most one NULL in the batch");
4432
4433 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4434 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4435 Address super_check_offset_addr(super_klass, sco_offset);
4436
4437 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4438 // range of a jccb. If this routine grows larger, reconsider at
4439 // least some of these.
4440#define local_jcc(assembler_cond, label) \
4441 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
4442 else jcc( assembler_cond, label) /*omit semi*/
4443
4444 // Hacked jmp, which may only be used just before L_fallthrough.
4445#define final_jmp(label) \
4446 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4447 else jmp(label) /*omit semi*/
4448
4449 // If the pointers are equal, we are done (e.g., String[] elements).
4450 // This self-check enables sharing of secondary supertype arrays among
4451 // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized secondary supertype array.
4453 // We move this check to the front of the fast path because many
4454 // type checks are in fact trivially successful in this manner,
4455 // so we get a nicely predicted branch right at the start of the check.
4456 cmpptr(sub_klass, super_klass);
4457 local_jcc(Assembler::equal, *L_success);
4458
4459 // Check the supertype display:
4460 if (must_load_sco) {
    // A positive movl does the right thing on LP64.
4462 movl(temp_reg, super_check_offset_addr);
4463 super_check_offset = RegisterOrConstant(temp_reg);
4464 }
4465 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4466 cmpptr(super_klass, super_check_addr); // load displayed supertype
4467
4468 // This check has worked decisively for primary supers.
4469 // Secondary supers are sought in the super_cache ('super_cache_addr').
4470 // (Secondary supers are interfaces and very deeply nested subtypes.)
4471 // This works in the same check above because of a tricky aliasing
4472 // between the super_cache and the primary super display elements.
4473 // (The 'super_check_addr' can address either, as the case requires.)
4474 // Note that the cache is updated below if it does not help us find
4475 // what we need immediately.
4476 // So if it was a primary super, we can just fail immediately.
4477 // Otherwise, it's the slow path for us (no success at this point).
4478
4479 if (super_check_offset.is_register()) {
4480 local_jcc(Assembler::equal, *L_success);
4481 cmpl(super_check_offset.as_register(), sc_offset);
4482 if (L_failure == &L_fallthrough) {
4483 local_jcc(Assembler::equal, *L_slow_path);
4484 } else {
4485 local_jcc(Assembler::notEqual, *L_failure);
4486 final_jmp(*L_slow_path);
4487 }
4488 } else if (super_check_offset.as_constant() == sc_offset) {
4489 // Need a slow path; fast failure is impossible.
4490 if (L_slow_path == &L_fallthrough) {
4491 local_jcc(Assembler::equal, *L_success);
4492 } else {
4493 local_jcc(Assembler::notEqual, *L_slow_path);
4494 final_jmp(*L_success);
4495 }
4496 } else {
4497 // No slow path; it's a fast decision.
4498 if (L_failure == &L_fallthrough) {
4499 local_jcc(Assembler::equal, *L_success);
4500 } else {
4501 local_jcc(Assembler::notEqual, *L_failure);
4502 final_jmp(*L_success);
4503 }
4504 }
4505
4506 bind(L_fallthrough);
4507
4508#undef local_jcc
4509#undef final_jmp
4510}
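
// The fast path above corresponds roughly to this logic (illustrative
// pseudocode; the cache update itself happens in the slow path below):
//
//   if (sub_klass == super_klass)                                        return success;
//   if (*(address)(sub_klass + super_klass->super_check_offset) == super_klass)
//     return success;   // hit in the primary display or the 1-element secondary cache
//   if (super_klass->super_check_offset != secondary_super_cache_offset)
//     return failure;   // a primary super would have matched above
//   goto slow_path;     // linear scan of sub_klass->secondary_supers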
4511
4512
4513void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4514 Register super_klass,
4515 Register temp_reg,
4516 Register temp2_reg,
4517 Label* L_success,
4518 Label* L_failure,
4519 bool set_cond_codes) {
4520 assert_different_registers(sub_klass, super_klass, temp_reg);
4521 if (temp2_reg != noreg)
4522 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4523#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4524
4525 Label L_fallthrough;
4526 int label_nulls = 0;
4527 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
4528 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
4529 assert(label_nulls <= 1, "at most one NULL in the batch");
4530
4531 // a couple of useful fields in sub_klass:
4532 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4533 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4534 Address secondary_supers_addr(sub_klass, ss_offset);
4535 Address super_cache_addr( sub_klass, sc_offset);
4536
4537 // Do a linear scan of the secondary super-klass chain.
4538 // This code is rarely used, so simplicity is a virtue here.
4539 // The repne_scan instruction uses fixed registers, which we must spill.
4540 // Don't worry too much about pre-existing connections with the input regs.
4541
4542 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4543 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4544
4545 // Get super_klass value into rax (even if it was in rdi or rcx).
4546 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4547 if (super_klass != rax || UseCompressedOops) {
4548 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4549 mov(rax, super_klass);
4550 }
4551 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4552 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4553
4554#ifndef PRODUCT
4555 int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4556 ExternalAddress pst_counter_addr((address) pst_counter);
4557 NOT_LP64( incrementl(pst_counter_addr) );
4558 LP64_ONLY( lea(rcx, pst_counter_addr) );
4559 LP64_ONLY( incrementl(Address(rcx, 0)) );
4560#endif //PRODUCT
4561
4562 // We will consult the secondary-super array.
4563 movptr(rdi, secondary_supers_addr);
  // Load the array length. (A positive movl does the right thing on LP64.)
4565 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4566 // Skip to start of data.
4567 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4568
4569 // Scan RCX words at [RDI] for an occurrence of RAX.
4570 // Set NZ/Z based on last compare.
  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
  // not change flags; only the repeated scas instruction sets them.
  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4574
4575 testptr(rax,rax); // Set Z = 0
4576 repne_scan();
4577
4578 // Unspill the temp. registers:
4579 if (pushed_rdi) pop(rdi);
4580 if (pushed_rcx) pop(rcx);
4581 if (pushed_rax) pop(rax);
4582
4583 if (set_cond_codes) {
4584 // Special hack for the AD files: rdi is guaranteed non-zero.
4585 assert(!pushed_rdi, "rdi must be left non-NULL");
4586 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4587 }
4588
4589 if (L_failure == &L_fallthrough)
4590 jccb(Assembler::notEqual, *L_failure);
4591 else jcc(Assembler::notEqual, *L_failure);
4592
4593 // Success. Cache the super we found and proceed in triumph.
4594 movptr(super_cache_addr, super_klass);
4595
4596 if (L_success != &L_fallthrough) {
4597 jmp(*L_success);
4598 }
4599
4600#undef IS_A_TEMP
4601
4602 bind(L_fallthrough);
4603}
4604
4605void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4606 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4607
4608 Label L_fallthrough;
4609 if (L_fast_path == NULL) {
4610 L_fast_path = &L_fallthrough;
4611 } else if (L_slow_path == NULL) {
4612 L_slow_path = &L_fallthrough;
4613 }
4614
4615 // Fast path check: class is fully initialized
4616 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4617 jcc(Assembler::equal, *L_fast_path);
4618
4619 // Fast path check: current thread is initializer thread
4620 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4621 if (L_slow_path == &L_fallthrough) {
4622 jcc(Assembler::equal, *L_fast_path);
4623 bind(*L_slow_path);
4624 } else if (L_fast_path == &L_fallthrough) {
4625 jcc(Assembler::notEqual, *L_slow_path);
4626 bind(*L_fast_path);
4627 } else {
4628 Unimplemented();
4629 }
4630}
4631
4632void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4633 if (VM_Version::supports_cmov()) {
4634 cmovl(cc, dst, src);
4635 } else {
4636 Label L;
4637 jccb(negate_condition(cc), L);
4638 movl(dst, src);
4639 bind(L);
4640 }
4641}
4642
4643void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4644 if (VM_Version::supports_cmov()) {
4645 cmovl(cc, dst, src);
4646 } else {
4647 Label L;
4648 jccb(negate_condition(cc), L);
4649 movl(dst, src);
4650 bind(L);
4651 }
4652}
4653
4654void MacroAssembler::verify_oop(Register reg, const char* s) {
4655 if (!VerifyOops) return;
4656
4657 // Pass register number to verify_oop_subroutine
4658 const char* b = NULL;
4659 {
4660 ResourceMark rm;
4661 stringStream ss;
4662 ss.print("verify_oop: %s: %s", reg->name(), s);
4663 b = code_string(ss.as_string());
4664 }
4665 BLOCK_COMMENT("verify_oop {");
4666#ifdef _LP64
4667 push(rscratch1); // save r10, trashed by movptr()
4668#endif
4669 push(rax); // save rax,
4670 push(reg); // pass register argument
4671 ExternalAddress buffer((address) b);
4672 // avoid using pushptr, as it modifies scratch registers
4673 // and our contract is not to modify anything
4674 movptr(rax, buffer.addr());
4675 push(rax);
4676 // call indirectly to solve generation ordering problem
4677 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4678 call(rax);
4679 // Caller pops the arguments (oop, message) and restores rax, r10
4680 BLOCK_COMMENT("} verify_oop");
4681}
4682
4683
4684RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4685 Register tmp,
4686 int offset) {
4687 intptr_t value = *delayed_value_addr;
4688 if (value != 0)
4689 return RegisterOrConstant(value + offset);
4690
4691 // load indirectly to solve generation ordering problem
4692 movptr(tmp, ExternalAddress((address) delayed_value_addr));
4693
4694#ifdef ASSERT
4695 { Label L;
4696 testptr(tmp, tmp);
4697 if (WizardMode) {
4698 const char* buf = NULL;
4699 {
4700 ResourceMark rm;
4701 stringStream ss;
4702 ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4703 buf = code_string(ss.as_string());
4704 }
4705 jcc(Assembler::notZero, L);
4706 STOP(buf);
4707 } else {
4708 jccb(Assembler::notZero, L);
4709 hlt();
4710 }
4711 bind(L);
4712 }
4713#endif
4714
4715 if (offset != 0)
4716 addptr(tmp, offset);
4717
4718 return RegisterOrConstant(tmp);
4719}
4720
4721
4722Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4723 int extra_slot_offset) {
4724 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4725 int stackElementSize = Interpreter::stackElementSize;
4726 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4727#ifdef ASSERT
4728 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4729 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4730#endif
4731 Register scale_reg = noreg;
4732 Address::ScaleFactor scale_factor = Address::no_scale;
4733 if (arg_slot.is_constant()) {
4734 offset += arg_slot.as_constant() * stackElementSize;
4735 } else {
4736 scale_reg = arg_slot.as_register();
4737 scale_factor = Address::times(stackElementSize);
4738 }
4739 offset += wordSize; // return PC is on stack
4740 return Address(rsp, scale_reg, scale_factor, offset);
4741}
4742
4743
4744void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4745 if (!VerifyOops) return;
4746
4747 // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4748 // Pass register number to verify_oop_subroutine
4749 const char* b = NULL;
4750 {
4751 ResourceMark rm;
4752 stringStream ss;
4753 ss.print("verify_oop_addr: %s", s);
4754 b = code_string(ss.as_string());
4755 }
4756#ifdef _LP64
4757 push(rscratch1); // save r10, trashed by movptr()
4758#endif
4759 push(rax); // save rax,
4760 // addr may contain rsp so we will have to adjust it based on the push
4761 // we just did (and on 64 bit we do two pushes)
4762  // NOTE: the 64-bit code appears to have had a bug: it did movq(addr, rax),
4763  // which stores rax into addr, the reverse of what was intended.
4764 if (addr.uses(rsp)) {
4765 lea(rax, addr);
4766 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4767 } else {
4768 pushptr(addr);
4769 }
4770
4771 ExternalAddress buffer((address) b);
4772 // pass msg argument
4773 // avoid using pushptr, as it modifies scratch registers
4774 // and our contract is not to modify anything
4775 movptr(rax, buffer.addr());
4776 push(rax);
4777
4778 // call indirectly to solve generation ordering problem
4779 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4780 call(rax);
4781 // Caller pops the arguments (addr, message) and restores rax, r10.
4782}
4783
4784void MacroAssembler::verify_tlab() {
4785#ifdef ASSERT
4786 if (UseTLAB && VerifyOops) {
4787 Label next, ok;
4788 Register t1 = rsi;
4789 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4790
4791 push(t1);
4792 NOT_LP64(push(thread_reg));
4793 NOT_LP64(get_thread(thread_reg));
4794
4795 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4796 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4797 jcc(Assembler::aboveEqual, next);
4798 STOP("assert(top >= start)");
4799 should_not_reach_here();
4800
4801 bind(next);
4802 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4803 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4804 jcc(Assembler::aboveEqual, ok);
4805 STOP("assert(top <= end)");
4806 should_not_reach_here();
4807
4808 bind(ok);
4809 NOT_LP64(pop(thread_reg));
4810 pop(t1);
4811 }
4812#endif
4813}
4814
4815class ControlWord {
4816 public:
4817 int32_t _value;
4818
4819 int rounding_control() const { return (_value >> 10) & 3 ; }
4820 int precision_control() const { return (_value >> 8) & 3 ; }
4821 bool precision() const { return ((_value >> 5) & 1) != 0; }
4822 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4823 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4824 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4825 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4826 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4827
4828 void print() const {
4829 // rounding control
4830 const char* rc;
4831 switch (rounding_control()) {
4832 case 0: rc = "round near"; break;
4833 case 1: rc = "round down"; break;
4834 case 2: rc = "round up "; break;
4835 case 3: rc = "chop "; break;
4836 };
4837 // precision control
4838 const char* pc;
4839 switch (precision_control()) {
4840 case 0: pc = "24 bits "; break;
4841 case 1: pc = "reserved"; break;
4842 case 2: pc = "53 bits "; break;
4843 case 3: pc = "64 bits "; break;
4844 };
4845 // flags
4846 char f[9];
4847 f[0] = ' ';
4848 f[1] = ' ';
4849 f[2] = (precision ()) ? 'P' : 'p';
4850 f[3] = (underflow ()) ? 'U' : 'u';
4851 f[4] = (overflow ()) ? 'O' : 'o';
4852 f[5] = (zero_divide ()) ? 'Z' : 'z';
4853 f[6] = (denormalized()) ? 'D' : 'd';
4854 f[7] = (invalid ()) ? 'I' : 'i';
4855 f[8] = '\x0';
4856 // output
4857 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4858 }
4859
4860};
4861
4862class StatusWord {
4863 public:
4864 int32_t _value;
4865
4866 bool busy() const { return ((_value >> 15) & 1) != 0; }
4867 bool C3() const { return ((_value >> 14) & 1) != 0; }
4868 bool C2() const { return ((_value >> 10) & 1) != 0; }
4869 bool C1() const { return ((_value >> 9) & 1) != 0; }
4870 bool C0() const { return ((_value >> 8) & 1) != 0; }
4871 int top() const { return (_value >> 11) & 7 ; }
4872 bool error_status() const { return ((_value >> 7) & 1) != 0; }
4873 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
4874 bool precision() const { return ((_value >> 5) & 1) != 0; }
4875 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4876 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4877 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4878 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4879 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4880
4881 void print() const {
4882 // condition codes
4883 char c[5];
4884 c[0] = (C3()) ? '3' : '-';
4885 c[1] = (C2()) ? '2' : '-';
4886 c[2] = (C1()) ? '1' : '-';
4887 c[3] = (C0()) ? '0' : '-';
4888 c[4] = '\x0';
4889 // flags
4890 char f[9];
4891 f[0] = (error_status()) ? 'E' : '-';
4892 f[1] = (stack_fault ()) ? 'S' : '-';
4893 f[2] = (precision ()) ? 'P' : '-';
4894 f[3] = (underflow ()) ? 'U' : '-';
4895 f[4] = (overflow ()) ? 'O' : '-';
4896 f[5] = (zero_divide ()) ? 'Z' : '-';
4897 f[6] = (denormalized()) ? 'D' : '-';
4898 f[7] = (invalid ()) ? 'I' : '-';
4899 f[8] = '\x0';
4900 // output
4901 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
4902 }
4903
4904};
4905
4906class TagWord {
4907 public:
4908 int32_t _value;
4909
4910 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
4911
4912 void print() const {
4913 printf("%04x", _value & 0xFFFF);
4914 }
4915
4916};
4917
4918class FPU_Register {
4919 public:
4920 int32_t _m0;
4921 int32_t _m1;
4922 int16_t _ex;
4923
4924 bool is_indefinite() const {
4925 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4926 }
4927
4928 void print() const {
4929 char sign = (_ex < 0) ? '-' : '+';
4930 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
4931 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
4932 };
4933
4934};
4935
4936class FPU_State {
4937 public:
4938 enum {
4939 register_size = 10,
4940 number_of_registers = 8,
4941 register_mask = 7
4942 };
4943
4944 ControlWord _control_word;
4945 StatusWord _status_word;
4946 TagWord _tag_word;
4947 int32_t _error_offset;
4948 int32_t _error_selector;
4949 int32_t _data_offset;
4950 int32_t _data_selector;
4951 int8_t _register[register_size * number_of_registers];
4952
4953 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4954 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
4955
4956 const char* tag_as_string(int tag) const {
4957 switch (tag) {
4958 case 0: return "valid";
4959 case 1: return "zero";
4960 case 2: return "special";
4961 case 3: return "empty";
4962 }
4963 ShouldNotReachHere();
4964 return NULL;
4965 }
4966
4967 void print() const {
4968 // print computation registers
4969 { int t = _status_word.top();
4970 for (int i = 0; i < number_of_registers; i++) {
4971 int j = (i - t) & register_mask;
4972 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4973 st(j)->print();
4974 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4975 }
4976 }
4977 printf("\n");
4978 // print control registers
4979 printf("ctrl = "); _control_word.print(); printf("\n");
4980 printf("stat = "); _status_word .print(); printf("\n");
4981 printf("tags = "); _tag_word .print(); printf("\n");
4982 }
4983
4984};
4985
4986class Flag_Register {
4987 public:
4988 int32_t _value;
4989
4990 bool overflow() const { return ((_value >> 11) & 1) != 0; }
4991 bool direction() const { return ((_value >> 10) & 1) != 0; }
4992 bool sign() const { return ((_value >> 7) & 1) != 0; }
4993 bool zero() const { return ((_value >> 6) & 1) != 0; }
4994 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
4995 bool parity() const { return ((_value >> 2) & 1) != 0; }
4996 bool carry() const { return ((_value >> 0) & 1) != 0; }
4997
4998 void print() const {
4999 // flags
5000 char f[8];
5001 f[0] = (overflow ()) ? 'O' : '-';
5002 f[1] = (direction ()) ? 'D' : '-';
5003 f[2] = (sign ()) ? 'S' : '-';
5004 f[3] = (zero ()) ? 'Z' : '-';
5005 f[4] = (auxiliary_carry()) ? 'A' : '-';
5006 f[5] = (parity ()) ? 'P' : '-';
5007 f[6] = (carry ()) ? 'C' : '-';
5008 f[7] = '\x0';
5009 // output
5010 printf("%08x flags = %s", _value, f);
5011 }
5012
5013};
5014
5015class IU_Register {
5016 public:
5017 int32_t _value;
5018
5019 void print() const {
5020 printf("%08x %11d", _value, _value);
5021 }
5022
5023};
5024
5025class IU_State {
5026 public:
5027 Flag_Register _eflags;
5028 IU_Register _rdi;
5029 IU_Register _rsi;
5030 IU_Register _rbp;
5031 IU_Register _rsp;
5032 IU_Register _rbx;
5033 IU_Register _rdx;
5034 IU_Register _rcx;
5035 IU_Register _rax;
5036
5037 void print() const {
5038 // computation registers
5039 printf("rax, = "); _rax.print(); printf("\n");
5040 printf("rbx, = "); _rbx.print(); printf("\n");
5041 printf("rcx = "); _rcx.print(); printf("\n");
5042 printf("rdx = "); _rdx.print(); printf("\n");
5043 printf("rdi = "); _rdi.print(); printf("\n");
5044 printf("rsi = "); _rsi.print(); printf("\n");
5045 printf("rbp, = "); _rbp.print(); printf("\n");
5046 printf("rsp = "); _rsp.print(); printf("\n");
5047 printf("\n");
5048 // control registers
5049 printf("flgs = "); _eflags.print(); printf("\n");
5050 }
5051};
5052
5053
5054class CPU_State {
5055 public:
5056 FPU_State _fpu_state;
5057 IU_State _iu_state;
5058
5059 void print() const {
5060 printf("--------------------------------------------------\n");
5061 _iu_state .print();
5062 printf("\n");
5063 _fpu_state.print();
5064 printf("--------------------------------------------------\n");
5065 }
5066
5067};
5068
5069
5070static void _print_CPU_state(CPU_State* state) {
5071 state->print();
5072};
5073
5074
5075void MacroAssembler::print_CPU_state() {
5076 push_CPU_state();
5077 push(rsp); // pass CPU state
5078 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5079 addptr(rsp, wordSize); // discard argument
5080 pop_CPU_state();
5081}
5082
5083
5084static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5085 static int counter = 0;
5086 FPU_State* fs = &state->_fpu_state;
5087 counter++;
5088 // For leaf calls, only verify that the top few elements remain empty.
5089 // We only need 1 empty at the top for C2 code.
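  // FPU tag values (see FPU_State::tag_as_string): 0 = valid, 1 = zero, 2 = special,
  // 3 = empty; the checks below use "== 3" / "!= 3" to test whether a slot is empty.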
5090 if( stack_depth < 0 ) {
5091 if( fs->tag_for_st(7) != 3 ) {
5092 printf("FPR7 not empty\n");
5093 state->print();
5094 assert(false, "error");
5095 return false;
5096 }
5097 return true; // All other stack states do not matter
5098 }
5099
5100 assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5101 "bad FPU control word");
5102
5103 // compute stack depth
5104 int i = 0;
5105 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
5106 int d = i;
5107 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5108 // verify findings
5109 if (i != FPU_State::number_of_registers) {
5110 // stack not contiguous
5111 printf("%s: stack not contiguous at ST%d\n", s, i);
5112 state->print();
5113 assert(false, "error");
5114 return false;
5115 }
5116 // check if computed stack depth corresponds to expected stack depth
5117 if (stack_depth < 0) {
5118 // expected stack depth is -stack_depth or less
5119 if (d > -stack_depth) {
5120 // too many elements on the stack
5121 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5122 state->print();
5123 assert(false, "error");
5124 return false;
5125 }
5126 } else {
5127 // expected stack depth is stack_depth
5128 if (d != stack_depth) {
5129 // wrong stack depth
5130 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5131 state->print();
5132 assert(false, "error");
5133 return false;
5134 }
5135 }
5136 // everything is cool
5137 return true;
5138}
5139
5140
5141void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5142 if (!VerifyFPU) return;
5143 push_CPU_state();
5144 push(rsp); // pass CPU state
5145 ExternalAddress msg((address) s);
5146 // pass message string s
5147 pushptr(msg.addr());
5148 push(stack_depth); // pass stack depth
5149 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5150 addptr(rsp, 3 * wordSize); // discard arguments
5151 // check for error
5152 { Label L;
5153 testl(rax, rax);
5154 jcc(Assembler::notZero, L);
5155 int3(); // break if error condition
5156 bind(L);
5157 }
5158 pop_CPU_state();
5159}
5160
5161void MacroAssembler::restore_cpu_control_state_after_jni() {
5162 // Either restore the MXCSR register after returning from the JNI Call
5163 // or verify that it wasn't changed (with -Xcheck:jni flag).
5164 if (VM_Version::supports_sse()) {
5165 if (RestoreMXCSROnJNICalls) {
5166 ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5167 } else if (CheckJNICalls) {
5168 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5169 }
5170 }
5171 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5172 vzeroupper();
5173  // Reset k1 to 0xffff (only done below for COMPILER2 when EVEX and post loop multiversioning are in use).
5174
5175#ifdef COMPILER2
5176 if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5177 push(rcx);
5178 movl(rcx, 0xffff);
5179 kmovwl(k1, rcx);
5180 pop(rcx);
5181 }
5182#endif // COMPILER2
5183
5184#ifndef _LP64
5185  // Either restore the x87 floating-point control word after returning
5186 // from the JNI call or verify that it wasn't changed.
5187 if (CheckJNICalls) {
5188 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5189 }
5190#endif // _LP64
5191}
5192
5193// ((OopHandle)result).resolve();
5194void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5195 assert_different_registers(result, tmp);
5196
5197 // Only 64 bit platforms support GCs that require a tmp register
5198 // Only IN_HEAP loads require a thread_tmp register
5199 // OopHandle::resolve is an indirection like jobject.
5200 access_load_at(T_OBJECT, IN_NATIVE,
5201 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5202}
5203
5204// ((WeakHandle)result).resolve();
5205void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5206 assert_different_registers(rresult, rtmp);
5207 Label resolved;
5208
5209 // A null weak handle resolves to null.
5210 cmpptr(rresult, 0);
5211 jcc(Assembler::equal, resolved);
5212
5213 // Only 64 bit platforms support GCs that require a tmp register
5214 // Only IN_HEAP loads require a thread_tmp register
5215 // WeakHandle::resolve is an indirection like jweak.
5216 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5217 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5218 bind(resolved);
5219}
5220
5221void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5222 // get mirror
5223 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5224 load_method_holder(mirror, method);
5225 movptr(mirror, Address(mirror, mirror_offset));
5226 resolve_oop_handle(mirror, tmp);
5227}
5228
5229void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5230 load_method_holder(rresult, rmethod);
5231 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5232}
5233
5234void MacroAssembler::load_method_holder(Register holder, Register method) {
5235 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
5236 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5237 movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5238}
5239
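// With compressed class pointers (64-bit only) the klass field of an object header is a
// 32-bit narrow value that has to be decoded on load and encoded on store; otherwise it
// is a full-width Klass*.  Roughly, the load below performs:
//   narrow = *(uint32_t*)(obj + oopDesc::klass_offset_in_bytes());
//   dst    = decode_klass_not_null(narrow);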
5240void MacroAssembler::load_klass(Register dst, Register src) {
5241#ifdef _LP64
5242 if (UseCompressedClassPointers) {
5243 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5244 decode_klass_not_null(dst);
5245 } else
5246#endif
5247 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5248}
5249
5250void MacroAssembler::load_prototype_header(Register dst, Register src) {
5251 load_klass(dst, src);
5252 movptr(dst, Address(dst, Klass::prototype_header_offset()));
5253}
5254
5255void MacroAssembler::store_klass(Register dst, Register src) {
5256#ifdef _LP64
5257 if (UseCompressedClassPointers) {
5258 encode_klass_not_null(src);
5259 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5260 } else
5261#endif
5262 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5263}
5264
5265void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5266 Register tmp1, Register thread_tmp) {
5267 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5268 decorators = AccessInternal::decorator_fixup(decorators);
5269 bool as_raw = (decorators & AS_RAW) != 0;
5270 if (as_raw) {
5271 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5272 } else {
5273 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5274 }
5275}
5276
5277void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5278 Register tmp1, Register tmp2) {
5279 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5280 decorators = AccessInternal::decorator_fixup(decorators);
5281 bool as_raw = (decorators & AS_RAW) != 0;
5282 if (as_raw) {
5283 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5284 } else {
5285 bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5286 }
5287}
5288
5289void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5290 // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5291 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5292 decorators |= ACCESS_READ | ACCESS_WRITE;
5293 }
5294 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5295 return bs->resolve(this, decorators, obj);
5296}
5297
5298void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5299 Register thread_tmp, DecoratorSet decorators) {
5300 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5301}
5302
5303// Doesn't do verification, generates fixed size code
5304void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5305 Register thread_tmp, DecoratorSet decorators) {
5306 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5307}
5308
5309void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5310 Register tmp2, DecoratorSet decorators) {
5311 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
5312}
5313
5314// Used for storing NULLs.
5315void MacroAssembler::store_heap_oop_null(Address dst) {
5316 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
5317}
5318
5319#ifdef _LP64
5320void MacroAssembler::store_klass_gap(Register dst, Register src) {
5321 if (UseCompressedClassPointers) {
5322 // Store to klass gap in destination
5323 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5324 }
5325}
5326
5327#ifdef ASSERT
5328void MacroAssembler::verify_heapbase(const char* msg) {
5329 assert (UseCompressedOops, "should be compressed");
5330 assert (Universe::heap() != NULL, "java heap should be initialized");
5331 if (CheckCompressedOops) {
5332 Label ok;
5333 push(rscratch1); // cmpptr trashes rscratch1
5334 cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5335 jcc(Assembler::equal, ok);
5336 STOP(msg);
5337 bind(ok);
5338 pop(rscratch1);
5339 }
5340}
5341#endif
5342
5343// Algorithm must match oop.inline.hpp encode_heap_oop.
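// In effect (and matching CompressedOops::encode):
//   if (base == NULL)  narrow = oop >> shift;
//   else               narrow = (oop == NULL) ? 0 : (oop - base) >> shift;
// The cmovq below maps a NULL oop onto the heap base so that the subtraction yields
// zero without needing a branch.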
5344void MacroAssembler::encode_heap_oop(Register r) {
5345#ifdef ASSERT
5346 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5347#endif
5348 verify_oop(r, "broken oop in encode_heap_oop");
5349 if (CompressedOops::base() == NULL) {
5350 if (CompressedOops::shift() != 0) {
5351 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5352 shrq(r, LogMinObjAlignmentInBytes);
5353 }
5354 return;
5355 }
5356 testq(r, r);
5357 cmovq(Assembler::equal, r, r12_heapbase);
5358 subq(r, r12_heapbase);
5359 shrq(r, LogMinObjAlignmentInBytes);
5360}
5361
5362void MacroAssembler::encode_heap_oop_not_null(Register r) {
5363#ifdef ASSERT
5364 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5365 if (CheckCompressedOops) {
5366 Label ok;
5367 testq(r, r);
5368 jcc(Assembler::notEqual, ok);
5369 STOP("null oop passed to encode_heap_oop_not_null");
5370 bind(ok);
5371 }
5372#endif
5373 verify_oop(r, "broken oop in encode_heap_oop_not_null");
5374 if (CompressedOops::base() != NULL) {
5375 subq(r, r12_heapbase);
5376 }
5377 if (CompressedOops::shift() != 0) {
5378 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5379 shrq(r, LogMinObjAlignmentInBytes);
5380 }
5381}
5382
5383void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5384#ifdef ASSERT
5385 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5386 if (CheckCompressedOops) {
5387 Label ok;
5388 testq(src, src);
5389 jcc(Assembler::notEqual, ok);
5390 STOP("null oop passed to encode_heap_oop_not_null2");
5391 bind(ok);
5392 }
5393#endif
5394 verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5395 if (dst != src) {
5396 movq(dst, src);
5397 }
5398 if (CompressedOops::base() != NULL) {
5399 subq(dst, r12_heapbase);
5400 }
5401 if (CompressedOops::shift() != 0) {
5402 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5403 shrq(dst, LogMinObjAlignmentInBytes);
5404 }
5405}
5406
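// Inverse of encode_heap_oop; in effect:
//   if (base == NULL)  oop = narrow << shift;
//   else               oop = (narrow == 0) ? NULL : base + (narrow << shift);
// The shlq below sets ZF when the narrow oop is zero, letting the jccb skip the base
// addition for NULL.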
5407void MacroAssembler::decode_heap_oop(Register r) {
5408#ifdef ASSERT
5409 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5410#endif
5411 if (CompressedOops::base() == NULL) {
5412 if (CompressedOops::shift() != 0) {
5413 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5414 shlq(r, LogMinObjAlignmentInBytes);
5415 }
5416 } else {
5417 Label done;
5418 shlq(r, LogMinObjAlignmentInBytes);
5419 jccb(Assembler::equal, done);
5420 addq(r, r12_heapbase);
5421 bind(done);
5422 }
5423 verify_oop(r, "broken oop in decode_heap_oop");
5424}
5425
5426void MacroAssembler::decode_heap_oop_not_null(Register r) {
5427 // Note: it will change flags
5428 assert (UseCompressedOops, "should only be used for compressed headers");
5429 assert (Universe::heap() != NULL, "java heap should be initialized");
5430 // Cannot assert, unverified entry point counts instructions (see .ad file)
5431 // vtableStubs also counts instructions in pd_code_size_limit.
5432 // Also do not verify_oop as this is called by verify_oop.
5433 if (CompressedOops::shift() != 0) {
5434 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5435 shlq(r, LogMinObjAlignmentInBytes);
5436 if (CompressedOops::base() != NULL) {
5437 addq(r, r12_heapbase);
5438 }
5439 } else {
5440 assert (CompressedOops::base() == NULL, "sanity");
5441 }
5442}
5443
5444void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5445 // Note: it will change flags
5446 assert (UseCompressedOops, "should only be used for compressed headers");
5447 assert (Universe::heap() != NULL, "java heap should be initialized");
5448 // Cannot assert, unverified entry point counts instructions (see .ad file)
5449 // vtableStubs also counts instructions in pd_code_size_limit.
5450 // Also do not verify_oop as this is called by verify_oop.
5451 if (CompressedOops::shift() != 0) {
5452 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5453 if (LogMinObjAlignmentInBytes == Address::times_8) {
5454 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5455 } else {
5456 if (dst != src) {
5457 movq(dst, src);
5458 }
5459 shlq(dst, LogMinObjAlignmentInBytes);
5460 if (CompressedOops::base() != NULL) {
5461 addq(dst, r12_heapbase);
5462 }
5463 }
5464 } else {
5465 assert (CompressedOops::base() == NULL, "sanity");
5466 if (dst != src) {
5467 movq(dst, src);
5468 }
5469 }
5470}
5471
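// Narrow klass encoding: narrow_klass = (klass - klass_base) >> klass_shift.
// r12 (normally the compressed-oop heap base) is borrowed as a scratch register to hold
// the klass base and is restored afterwards with reinit_heapbase().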
5472void MacroAssembler::encode_klass_not_null(Register r) {
5473 if (CompressedKlassPointers::base() != NULL) {
5474 // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5475 assert(r != r12_heapbase, "Encoding a klass in r12");
5476 mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5477 subq(r, r12_heapbase);
5478 }
5479 if (CompressedKlassPointers::shift() != 0) {
5480 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5481 shrq(r, LogKlassAlignmentInBytes);
5482 }
5483 if (CompressedKlassPointers::base() != NULL) {
5484 reinit_heapbase();
5485 }
5486}
5487
5488void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5489 if (dst == src) {
5490 encode_klass_not_null(src);
5491 } else {
5492 if (CompressedKlassPointers::base() != NULL) {
5493 mov64(dst, (int64_t)CompressedKlassPointers::base());
5494 negq(dst);
5495 addq(dst, src);
5496 } else {
5497 movptr(dst, src);
5498 }
5499 if (CompressedKlassPointers::shift() != 0) {
5500 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5501 shrq(dst, LogKlassAlignmentInBytes);
5502 }
5503 }
5504}
5505
5506// Function instr_size_for_decode_klass_not_null() counts the instructions
5507// generated by decode_klass_not_null(register r) and reinit_heapbase(),
5508// when (Universe::heap() != NULL). Hence, if the instructions they
5509// generate change, then this method needs to be updated.
5510int MacroAssembler::instr_size_for_decode_klass_not_null() {
5511 assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5512 if (CompressedKlassPointers::base() != NULL) {
5513 // mov64 + addq + shlq? + mov64 (for reinit_heapbase()).
5514 return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5515 } else {
5516 // longest load decode klass function, mov64, leaq
5517 return 16;
5518 }
5519}
5520
5521// !!! If the instructions that get generated here change then function
5522// instr_size_for_decode_klass_not_null() needs to get updated.
5523void MacroAssembler::decode_klass_not_null(Register r) {
5524 // Note: it will change flags
5525 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5526 assert(r != r12_heapbase, "Decoding a klass in r12");
5527 // Cannot assert, unverified entry point counts instructions (see .ad file)
5528 // vtableStubs also counts instructions in pd_code_size_limit.
5529 // Also do not verify_oop as this is called by verify_oop.
5530 if (CompressedKlassPointers::shift() != 0) {
5531 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5532 shlq(r, LogKlassAlignmentInBytes);
5533 }
5534 // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5535 if (CompressedKlassPointers::base() != NULL) {
5536 mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5537 addq(r, r12_heapbase);
5538 reinit_heapbase();
5539 }
5540}
5541
5542void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5543 // Note: it will change flags
5544 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5545 if (dst == src) {
5546 decode_klass_not_null(dst);
5547 } else {
5548 // Cannot assert, unverified entry point counts instructions (see .ad file)
5549 // vtableStubs also counts instructions in pd_code_size_limit.
5550 // Also do not verify_oop as this is called by verify_oop.
5551 mov64(dst, (int64_t)CompressedKlassPointers::base());
5552 if (CompressedKlassPointers::shift() != 0) {
5553 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5554 assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5555 leaq(dst, Address(dst, src, Address::times_8, 0));
5556 } else {
5557 addq(dst, src);
5558 }
5559 }
5560}
5561
5562void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5563 assert (UseCompressedOops, "should only be used for compressed headers");
5564 assert (Universe::heap() != NULL, "java heap should be initialized");
5565 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5566 int oop_index = oop_recorder()->find_index(obj);
5567 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5568 mov_narrow_oop(dst, oop_index, rspec);
5569}
5570
5571void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5572 assert (UseCompressedOops, "should only be used for compressed headers");
5573 assert (Universe::heap() != NULL, "java heap should be initialized");
5574 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5575 int oop_index = oop_recorder()->find_index(obj);
5576 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5577 mov_narrow_oop(dst, oop_index, rspec);
5578}
5579
5580void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5581 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5582 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5583 int klass_index = oop_recorder()->find_index(k);
5584 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5585 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5586}
5587
5588void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5589 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5590 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5591 int klass_index = oop_recorder()->find_index(k);
5592 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5593 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5594}
5595
5596void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5597 assert (UseCompressedOops, "should only be used for compressed headers");
5598 assert (Universe::heap() != NULL, "java heap should be initialized");
5599 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5600 int oop_index = oop_recorder()->find_index(obj);
5601 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5602 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5603}
5604
5605void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5606 assert (UseCompressedOops, "should only be used for compressed headers");
5607 assert (Universe::heap() != NULL, "java heap should be initialized");
5608 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5609 int oop_index = oop_recorder()->find_index(obj);
5610 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5611 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5612}
5613
5614void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5615 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5616 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5617 int klass_index = oop_recorder()->find_index(k);
5618 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5619 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5620}
5621
5622void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5623 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5624 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5625 int klass_index = oop_recorder()->find_index(k);
5626 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5627 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5628}
5629
5630void MacroAssembler::reinit_heapbase() {
5631 if (UseCompressedOops || UseCompressedClassPointers) {
5632 if (Universe::heap() != NULL) {
5633 if (CompressedOops::base() == NULL) {
5634 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5635 } else {
5636 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5637 }
5638 } else {
5639 movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5640 }
5641 }
5642}
5643
5644#endif // _LP64
5645
5646// C2 compiled method's prolog code.
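// The incoming framesize includes the return address slot.  Roughly, the prolog produces:
//   [return address] [saved rbp] [optional 0xbadb100d stack-depth cookie] [frame body] <- rsp
// after optionally banging stack_bang_size bytes of stack first.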
5647void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5648
5649 // WARNING: Initial instruction MUST be 5 bytes or longer so that
5650 // NativeJump::patch_verified_entry will be able to patch out the entry
5651 // code safely. The push to verify stack depth is ok at 5 bytes,
5652 // the frame allocation can be either 3 or 6 bytes. So if we don't do
5653 // stack bang then we must use the 6 byte frame allocation even if
5654 // we have no frame. :-(
5655 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5656
5657 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5658 // Remove word for return addr
5659 framesize -= wordSize;
5660 stack_bang_size -= wordSize;
5661
5662  // Calls to C2R adapters often do not accept exceptional returns.
5663  // We require that their callers must bang for them.  Be careful, because
5664  // some VM calls (such as call site linkage) can use several kilobytes of
5665  // stack; the stack safety zone should account for that.
5666 // See bugs 4446381, 4468289, 4497237.
5667 if (stack_bang_size > 0) {
5668 generate_stack_overflow_check(stack_bang_size);
5669
5670 // We always push rbp, so that on return to interpreter rbp, will be
5671 // restored correctly and we can correct the stack.
5672 push(rbp);
5673 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5674 if (PreserveFramePointer) {
5675 mov(rbp, rsp);
5676 }
5677 // Remove word for ebp
5678 framesize -= wordSize;
5679
5680 // Create frame
5681 if (framesize) {
5682 subptr(rsp, framesize);
5683 }
5684 } else {
5685 // Create frame (force generation of a 4 byte immediate value)
5686 subptr_imm32(rsp, framesize);
5687
5688 // Save RBP register now.
5689 framesize -= wordSize;
5690 movptr(Address(rsp, framesize), rbp);
5691 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5692 if (PreserveFramePointer) {
5693 movptr(rbp, rsp);
5694 if (framesize > 0) {
5695 addptr(rbp, framesize);
5696 }
5697 }
5698 }
5699
5700 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5701 framesize -= wordSize;
5702 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5703 }
5704
5705#ifndef _LP64
5706 // If method sets FPU control word do it now
5707 if (fp_mode_24b) {
5708 fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5709 }
5710 if (UseSSE >= 2 && VerifyFPU) {
5711 verify_FPU(0, "FPU stack must be clean on entry");
5712 }
5713#endif
5714
5715#ifdef ASSERT
5716 if (VerifyStackAtCalls) {
5717 Label L;
5718 push(rax);
5719 mov(rax, rsp);
5720 andptr(rax, StackAlignmentInBytes-1);
5721 cmpptr(rax, StackAlignmentInBytes-wordSize);
5722 pop(rax);
5723 jcc(Assembler::equal, L);
5724 STOP("Stack is not properly aligned!");
5725 bind(L);
5726 }
5727#endif
5728
5729 if (!is_stub) {
5730 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5731 bs->nmethod_entry_barrier(this);
5732 }
5733}
5734
5735// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5736void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
5737 // cnt - number of qwords (8-byte words).
5738 // base - start address, qword aligned.
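  // Strategy: zero an XMM/YMM register, store 64 bytes per iteration in the main loop,
  // handle a remaining half-block with one 32-byte store, and finish with an 8-byte
  // loop for the tail (cnt is consumed 8, then 4, then 1 qwords at a time).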
5739 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5740 if (UseAVX >= 2) {
5741 vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5742 } else {
5743 pxor(xtmp, xtmp);
5744 }
5745 jmp(L_zero_64_bytes);
5746
5747 BIND(L_loop);
5748 if (UseAVX >= 2) {
5749 vmovdqu(Address(base, 0), xtmp);
5750 vmovdqu(Address(base, 32), xtmp);
5751 } else {
5752 movdqu(Address(base, 0), xtmp);
5753 movdqu(Address(base, 16), xtmp);
5754 movdqu(Address(base, 32), xtmp);
5755 movdqu(Address(base, 48), xtmp);
5756 }
5757 addptr(base, 64);
5758
5759 BIND(L_zero_64_bytes);
5760 subptr(cnt, 8);
5761 jccb(Assembler::greaterEqual, L_loop);
5762 addptr(cnt, 4);
5763 jccb(Assembler::less, L_tail);
5764 // Copy trailing 32 bytes
5765 if (UseAVX >= 2) {
5766 vmovdqu(Address(base, 0), xtmp);
5767 } else {
5768 movdqu(Address(base, 0), xtmp);
5769 movdqu(Address(base, 16), xtmp);
5770 }
5771 addptr(base, 32);
5772 subptr(cnt, 4);
5773
5774 BIND(L_tail);
5775 addptr(cnt, 4);
5776 jccb(Assembler::lessEqual, L_end);
5777 decrement(cnt);
5778
5779 BIND(L_sloop);
5780 movq(Address(base, 0), xtmp);
5781 addptr(base, 8);
5782 decrement(cnt);
5783 jccb(Assembler::greaterEqual, L_sloop);
5784 BIND(L_end);
5785}
5786
5787void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5788 // cnt - number of qwords (8-byte words).
5789 // base - start address, qword aligned.
5790  // is_large - true when the optimizer knows cnt is larger than InitArrayShortSize
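  // Dispatch: when !is_large and the region is at most InitArrayShortSize bytes, use a
  // simple pointer-sized store loop; otherwise use "rep stosb" (UseFastStosb), XMM/YMM
  // stores (UseXMMForObjInit), or plain "rep stos".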
5791 assert(base==rdi, "base register must be edi for rep stos");
5792 assert(tmp==rax, "tmp register must be eax for rep stos");
5793 assert(cnt==rcx, "cnt register must be ecx for rep stos");
5794 assert(InitArrayShortSize % BytesPerLong == 0,
5795 "InitArrayShortSize should be the multiple of BytesPerLong");
5796
5797 Label DONE;
5798
5799 if (!is_large || !UseXMMForObjInit) {
5800 xorptr(tmp, tmp);
5801 }
5802
5803 if (!is_large) {
5804 Label LOOP, LONG;
5805 cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5806 jccb(Assembler::greater, LONG);
5807
5808 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5809
5810 decrement(cnt);
5811 jccb(Assembler::negative, DONE); // Zero length
5812
5813 // Use individual pointer-sized stores for small counts:
5814 BIND(LOOP);
5815 movptr(Address(base, cnt, Address::times_ptr), tmp);
5816 decrement(cnt);
5817 jccb(Assembler::greaterEqual, LOOP);
5818 jmpb(DONE);
5819
5820 BIND(LONG);
5821 }
5822
5823 // Use longer rep-prefixed ops for non-small counts:
5824 if (UseFastStosb) {
5825 shlptr(cnt, 3); // convert to number of bytes
5826 rep_stosb();
5827 } else if (UseXMMForObjInit) {
5828 movptr(tmp, base);
5829 xmm_clear_mem(tmp, cnt, xtmp);
5830 } else {
5831 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5832 rep_stos();
5833 }
5834
5835 BIND(DONE);
5836}
5837
5838#ifdef COMPILER2
5839
5840// IndexOf for constant substrings with size >= 8 chars
5841// which don't need to be loaded through stack.
5842void MacroAssembler::string_indexofC8(Register str1, Register str2,
5843 Register cnt1, Register cnt2,
5844 int int_cnt2, Register result,
5845 XMMRegister vec, Register tmp,
5846 int ae) {
5847 ShortBranchVerifier sbv(this);
5848 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5849 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5850
5851 // This method uses the pcmpestri instruction with bound registers
5852 // inputs:
5853 // xmm - substring
5854 // rax - substring length (elements count)
5855 // mem - scanned string
5856 // rdx - string length (elements count)
5857 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5858 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5859 // outputs:
5860 // rcx - matched index in string
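  // flags:
  //   CF - set when pcmpestri found a (possibly partial) match anywhere in the vector
  //   OF - set when the substring fragment matches at element 0 (rcx == 0)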
5861 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5862 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5863 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5864 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5865 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
5866
5867 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
5868 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
5869 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
5870
5871 // Note, inline_string_indexOf() generates checks:
5872 // if (substr.count > string.count) return -1;
5873 // if (substr.count == 0) return 0;
5874 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
5875
5876 // Load substring.
5877 if (ae == StrIntrinsicNode::UL) {
5878 pmovzxbw(vec, Address(str2, 0));
5879 } else {
5880 movdqu(vec, Address(str2, 0));
5881 }
5882 movl(cnt2, int_cnt2);
5883 movptr(result, str1); // string addr
5884
5885 if (int_cnt2 > stride) {
5886 jmpb(SCAN_TO_SUBSTR);
5887
5888 // Reload substr for rescan, this code
5889 // is executed only for large substrings (> 8 chars)
5890 bind(RELOAD_SUBSTR);
5891 if (ae == StrIntrinsicNode::UL) {
5892 pmovzxbw(vec, Address(str2, 0));
5893 } else {
5894 movdqu(vec, Address(str2, 0));
5895 }
5896 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
5897
5898 bind(RELOAD_STR);
5899 // We came here after the beginning of the substring was
5900    // matched but the rest of it was not, so we need to search
5901 // again. Start from the next element after the previous match.
5902
5903    // cnt2 is the number of remaining substring elements and
5904    // cnt1 is the number of remaining string elements when the compare failed.
5905 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
5906 subl(cnt1, cnt2);
5907 addl(cnt1, int_cnt2);
5908 movl(cnt2, int_cnt2); // Now restore cnt2
5909
5910 decrementl(cnt1); // Shift to next element
5911 cmpl(cnt1, cnt2);
5912    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer string characters left than the substring
5913
5914 addptr(result, (1<<scale1));
5915
5916 } // (int_cnt2 > 8)
5917
5918 // Scan string for start of substr in 16-byte vectors
5919 bind(SCAN_TO_SUBSTR);
5920 pcmpestri(vec, Address(result, 0), mode);
5921 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
5922 subl(cnt1, stride);
5923 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
5924 cmpl(cnt1, cnt2);
5925  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer string characters left than the substring
5926 addptr(result, 16);
5927 jmpb(SCAN_TO_SUBSTR);
5928
5929 // Found a potential substr
5930 bind(FOUND_CANDIDATE);
5931 // Matched whole vector if first element matched (tmp(rcx) == 0).
5932 if (int_cnt2 == stride) {
5933 jccb(Assembler::overflow, RET_FOUND); // OF == 1
5934 } else { // int_cnt2 > 8
5935 jccb(Assembler::overflow, FOUND_SUBSTR);
5936 }
5937 // After pcmpestri tmp(rcx) contains matched element index
5938 // Compute start addr of substr
5939 lea(result, Address(result, tmp, scale1));
5940
5941 // Make sure string is still long enough
5942 subl(cnt1, tmp);
5943 cmpl(cnt1, cnt2);
5944 if (int_cnt2 == stride) {
5945 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
5946 } else { // int_cnt2 > 8
5947 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
5948 }
5949  // Fewer string characters left than the substring length.
5950
5951 bind(RET_NOT_FOUND);
5952 movl(result, -1);
5953 jmp(EXIT);
5954
5955 if (int_cnt2 > stride) {
5956 // This code is optimized for the case when whole substring
5957 // is matched if its head is matched.
5958 bind(MATCH_SUBSTR_HEAD);
5959 pcmpestri(vec, Address(result, 0), mode);
5960    // Reload only the string if it does not match
5961 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
5962
5963 Label CONT_SCAN_SUBSTR;
5964 // Compare the rest of substring (> 8 chars).
5965 bind(FOUND_SUBSTR);
5966 // First 8 chars are already matched.
5967 negptr(cnt2);
5968 addptr(cnt2, stride);
5969
5970 bind(SCAN_SUBSTR);
5971 subl(cnt1, stride);
5972 cmpl(cnt2, -stride); // Do not read beyond substring
5973 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
5974 // Back-up strings to avoid reading beyond substring:
5975 // cnt1 = cnt1 - cnt2 + 8
5976 addl(cnt1, cnt2); // cnt2 is negative
5977 addl(cnt1, stride);
5978 movl(cnt2, stride); negptr(cnt2);
5979 bind(CONT_SCAN_SUBSTR);
5980 if (int_cnt2 < (int)G) {
5981 int tail_off1 = int_cnt2<<scale1;
5982 int tail_off2 = int_cnt2<<scale2;
5983 if (ae == StrIntrinsicNode::UL) {
5984 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
5985 } else {
5986 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
5987 }
5988 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
5989 } else {
5990 // calculate index in register to avoid integer overflow (int_cnt2*2)
5991 movl(tmp, int_cnt2);
5992 addptr(tmp, cnt2);
5993 if (ae == StrIntrinsicNode::UL) {
5994 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
5995 } else {
5996 movdqu(vec, Address(str2, tmp, scale2, 0));
5997 }
5998 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
5999 }
6000    // Need to reload the string pointers if we did not match the whole vector
6001 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6002 addptr(cnt2, stride);
6003 jcc(Assembler::negative, SCAN_SUBSTR);
6004 // Fall through if found full substring
6005
6006 } // (int_cnt2 > 8)
6007
6008 bind(RET_FOUND);
6009 // Found result if we matched full small substring.
6010 // Compute substr offset
6011 subptr(result, str1);
6012 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6013 shrl(result, 1); // index
6014 }
6015 bind(EXIT);
6016
6017} // string_indexofC8
6018
6019// Small strings are loaded through the stack if they might cross a page boundary.
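// pcmpestri always performs 16-byte loads, so a string shorter than 16 bytes that ends
// close to a page boundary is first copied onto the stack to make the vector load safe;
// the original rsp is saved and restored at CLEANUP.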
6020void MacroAssembler::string_indexof(Register str1, Register str2,
6021 Register cnt1, Register cnt2,
6022 int int_cnt2, Register result,
6023 XMMRegister vec, Register tmp,
6024 int ae) {
6025 ShortBranchVerifier sbv(this);
6026 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6027 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6028
6029 //
6030 // int_cnt2 is length of small (< 8 chars) constant substring
6031 // or (-1) for non constant substring in which case its length
6032 // is in cnt2 register.
6033 //
6034 // Note, inline_string_indexOf() generates checks:
6035 // if (substr.count > string.count) return -1;
6036 // if (substr.count == 0) return 0;
6037 //
6038 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6039 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6040 // This method uses the pcmpestri instruction with bound registers
6041 // inputs:
6042 // xmm - substring
6043 // rax - substring length (elements count)
6044 // mem - scanned string
6045 // rdx - string length (elements count)
6046 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6047 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6048 // outputs:
6049 // rcx - matched index in string
6050 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6051 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6052 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6053 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6054
6055 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6056 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6057 FOUND_CANDIDATE;
6058
6059 { //========================================================
6060 // We don't know where these strings are located
6061 // and we can't read beyond them. Load them through stack.
6062 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6063
6064 movptr(tmp, rsp); // save old SP
6065
6066 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
6067 if (int_cnt2 == (1>>scale2)) { // One byte
6068 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6069 load_unsigned_byte(result, Address(str2, 0));
6070 movdl(vec, result); // move 32 bits
6071 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
6072 // Not enough header space in 32-bit VM: 12+3 = 15.
6073 movl(result, Address(str2, -1));
6074 shrl(result, 8);
6075 movdl(vec, result); // move 32 bits
6076 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
6077 load_unsigned_short(result, Address(str2, 0));
6078 movdl(vec, result); // move 32 bits
6079 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6080 movdl(vec, Address(str2, 0)); // move 32 bits
6081 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6082 movq(vec, Address(str2, 0)); // move 64 bits
6083      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
6084 // Array header size is 12 bytes in 32-bit VM
6085 // + 6 bytes for 3 chars == 18 bytes,
6086 // enough space to load vec and shift.
6087 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6088 if (ae == StrIntrinsicNode::UL) {
6089 int tail_off = int_cnt2-8;
6090 pmovzxbw(vec, Address(str2, tail_off));
6091 psrldq(vec, -2*tail_off);
6092 }
6093 else {
6094 int tail_off = int_cnt2*(1<<scale2);
6095 movdqu(vec, Address(str2, tail_off-16));
6096 psrldq(vec, 16-tail_off);
6097 }
6098 }
6099 } else { // not constant substring
6100 cmpl(cnt2, stride);
6101 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6102
6103      // We can read beyond the string if str+16 does not cross a page boundary
6104 // since heaps are aligned and mapped by pages.
6105 assert(os::vm_page_size() < (int)G, "default page should be small");
6106 movl(result, str2); // We need only low 32 bits
6107 andl(result, (os::vm_page_size()-1));
6108 cmpl(result, (os::vm_page_size()-16));
6109 jccb(Assembler::belowEqual, CHECK_STR);
6110
6111      // Move small strings to the stack to allow loading 16 bytes into vec.
6112 subptr(rsp, 16);
6113 int stk_offset = wordSize-(1<<scale2);
6114 push(cnt2);
6115
6116 bind(COPY_SUBSTR);
6117 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6118 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6119 movb(Address(rsp, cnt2, scale2, stk_offset), result);
6120 } else if (ae == StrIntrinsicNode::UU) {
6121 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6122 movw(Address(rsp, cnt2, scale2, stk_offset), result);
6123 }
6124 decrement(cnt2);
6125 jccb(Assembler::notZero, COPY_SUBSTR);
6126
6127 pop(cnt2);
6128 movptr(str2, rsp); // New substring address
6129 } // non constant
6130
6131 bind(CHECK_STR);
6132 cmpl(cnt1, stride);
6133 jccb(Assembler::aboveEqual, BIG_STRINGS);
6134
6135 // Check cross page boundary.
6136 movl(result, str1); // We need only low 32 bits
6137 andl(result, (os::vm_page_size()-1));
6138 cmpl(result, (os::vm_page_size()-16));
6139 jccb(Assembler::belowEqual, BIG_STRINGS);
6140
6141 subptr(rsp, 16);
6142 int stk_offset = -(1<<scale1);
6143 if (int_cnt2 < 0) { // not constant
6144 push(cnt2);
6145 stk_offset += wordSize;
6146 }
6147 movl(cnt2, cnt1);
6148
6149 bind(COPY_STR);
6150 if (ae == StrIntrinsicNode::LL) {
6151 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6152 movb(Address(rsp, cnt2, scale1, stk_offset), result);
6153 } else {
6154 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6155 movw(Address(rsp, cnt2, scale1, stk_offset), result);
6156 }
6157 decrement(cnt2);
6158 jccb(Assembler::notZero, COPY_STR);
6159
6160 if (int_cnt2 < 0) { // not constant
6161 pop(cnt2);
6162 }
6163 movptr(str1, rsp); // New string address
6164
6165 bind(BIG_STRINGS);
6166 // Load substring.
6167 if (int_cnt2 < 0) { // -1
6168 if (ae == StrIntrinsicNode::UL) {
6169 pmovzxbw(vec, Address(str2, 0));
6170 } else {
6171 movdqu(vec, Address(str2, 0));
6172 }
6173 push(cnt2); // substr count
6174 push(str2); // substr addr
6175 push(str1); // string addr
6176 } else {
6177 // Small (< 8 chars) constant substrings are loaded already.
6178 movl(cnt2, int_cnt2);
6179 }
6180 push(tmp); // original SP
6181
6182 } // Finished loading
6183
6184 //========================================================
6185 // Start search
6186 //
6187
6188 movptr(result, str1); // string addr
6189
6190 if (int_cnt2 < 0) { // Only for non constant substring
6191 jmpb(SCAN_TO_SUBSTR);
6192
6193 // SP saved at sp+0
6194 // String saved at sp+1*wordSize
6195 // Substr saved at sp+2*wordSize
6196 // Substr count saved at sp+3*wordSize
6197
6198 // Reload substr for rescan, this code
6199 // is executed only for large substrings (> 8 chars)
6200 bind(RELOAD_SUBSTR);
6201 movptr(str2, Address(rsp, 2*wordSize));
6202 movl(cnt2, Address(rsp, 3*wordSize));
6203 if (ae == StrIntrinsicNode::UL) {
6204 pmovzxbw(vec, Address(str2, 0));
6205 } else {
6206 movdqu(vec, Address(str2, 0));
6207 }
6208 // We came here after the beginning of the substring was
6209    // matched but the rest of it was not, so we need to search
6210 // again. Start from the next element after the previous match.
6211 subptr(str1, result); // Restore counter
6212 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6213 shrl(str1, 1);
6214 }
6215 addl(cnt1, str1);
6216 decrementl(cnt1); // Shift to next element
6217 cmpl(cnt1, cnt2);
6218    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer string characters left than the substring
6219
6220 addptr(result, (1<<scale1));
6221 } // non constant
6222
6223 // Scan string for start of substr in 16-byte vectors
6224 bind(SCAN_TO_SUBSTR);
6225 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6226 pcmpestri(vec, Address(result, 0), mode);
6227 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
6228 subl(cnt1, stride);
6229 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6230 cmpl(cnt1, cnt2);
6231  jccb(Assembler::negative, RET_NOT_FOUND); // Fewer string characters left than the substring
6232 addptr(result, 16);
6233
6234 bind(ADJUST_STR);
6235 cmpl(cnt1, stride); // Do not read beyond string
6236 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6237 // Back-up string to avoid reading beyond string.
6238 lea(result, Address(result, cnt1, scale1, -16));
6239 movl(cnt1, stride);
6240 jmpb(SCAN_TO_SUBSTR);
6241
6242 // Found a potential substr
6243 bind(FOUND_CANDIDATE);
6244 // After pcmpestri tmp(rcx) contains matched element index
6245
6246 // Make sure string is still long enough
6247 subl(cnt1, tmp);
6248 cmpl(cnt1, cnt2);
6249 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6250  // Fewer string characters left than the substring length.
6251
6252 bind(RET_NOT_FOUND);
6253 movl(result, -1);
6254 jmp(CLEANUP);
6255
6256 bind(FOUND_SUBSTR);
6257 // Compute start addr of substr
6258 lea(result, Address(result, tmp, scale1));
6259 if (int_cnt2 > 0) { // Constant substring
6260 // Repeat search for small substring (< 8 chars)
6261 // from new point without reloading substring.
6262 // Have to check that we don't read beyond string.
6263 cmpl(tmp, stride-int_cnt2);
6264 jccb(Assembler::greater, ADJUST_STR);
6265 // Fall through if matched whole substring.
6266 } else { // non constant
6267 assert(int_cnt2 == -1, "should be != 0");
6268
6269 addl(tmp, cnt2);
6270 // Found result if we matched whole substring.
6271 cmpl(tmp, stride);
6272 jcc(Assembler::lessEqual, RET_FOUND);
6273
6274 // Repeat search for small substring (<= 8 chars)
6275 // from new point 'str1' without reloading substring.
6276 cmpl(cnt2, stride);
6277 // Have to check that we don't read beyond string.
6278 jccb(Assembler::lessEqual, ADJUST_STR);
6279
6280 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6281 // Compare the rest of substring (> 8 chars).
6282 movptr(str1, result);
6283
6284 cmpl(tmp, cnt2);
6285 // First 8 chars are already matched.
6286 jccb(Assembler::equal, CHECK_NEXT);
6287
6288 bind(SCAN_SUBSTR);
6289 pcmpestri(vec, Address(str1, 0), mode);
6290    // Need to reload the string pointers if we did not match the whole vector
6291 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6292
6293 bind(CHECK_NEXT);
6294 subl(cnt2, stride);
6295 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6296 addptr(str1, 16);
6297 if (ae == StrIntrinsicNode::UL) {
6298 addptr(str2, 8);
6299 } else {
6300 addptr(str2, 16);
6301 }
6302 subl(cnt1, stride);
6303 cmpl(cnt2, stride); // Do not read beyond substring
6304 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6305 // Back-up strings to avoid reading beyond substring.
6306
6307 if (ae == StrIntrinsicNode::UL) {
6308 lea(str2, Address(str2, cnt2, scale2, -8));
6309 lea(str1, Address(str1, cnt2, scale1, -16));
6310 } else {
6311 lea(str2, Address(str2, cnt2, scale2, -16));
6312 lea(str1, Address(str1, cnt2, scale1, -16));
6313 }
6314 subl(cnt1, cnt2);
6315 movl(cnt2, stride);
6316 addl(cnt1, stride);
6317 bind(CONT_SCAN_SUBSTR);
6318 if (ae == StrIntrinsicNode::UL) {
6319 pmovzxbw(vec, Address(str2, 0));
6320 } else {
6321 movdqu(vec, Address(str2, 0));
6322 }
6323 jmp(SCAN_SUBSTR);
6324
6325 bind(RET_FOUND_LONG);
6326 movptr(str1, Address(rsp, wordSize));
6327 } // non constant
6328
6329 bind(RET_FOUND);
6330 // Compute substr offset
6331 subptr(result, str1);
6332 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6333 shrl(result, 1); // index
6334 }
6335 bind(CLEANUP);
6336 pop(rsp); // restore SP
6337
6338} // string_indexof
6339
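// Scans an array of 16-bit chars for 'ch'.  With AVX2 the character is broadcast and
// compared 16 chars at a time (vpcmpeqw + vptest against a zero vector: carry clear
// means some lane matched), then 8 chars at a time with SSE, with a scalar loop for
// the tail.  The result is the char index of the first occurrence, or -1 if not found.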
6340void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6341 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6342 ShortBranchVerifier sbv(this);
6343 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6344
6345 int stride = 8;
6346
6347 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6348 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6349 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6350 FOUND_SEQ_CHAR, DONE_LABEL;
6351
6352 movptr(result, str1);
6353 if (UseAVX >= 2) {
6354 cmpl(cnt1, stride);
6355 jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6356 cmpl(cnt1, 2*stride);
6357 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6358 movdl(vec1, ch);
6359 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6360 vpxor(vec2, vec2);
6361 movl(tmp, cnt1);
6362 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
6363 andl(cnt1,0x0000000F); //tail count (in chars)
6364
6365 bind(SCAN_TO_16_CHAR_LOOP);
6366 vmovdqu(vec3, Address(result, 0));
6367 vpcmpeqw(vec3, vec3, vec1, 1);
6368 vptest(vec2, vec3);
6369 jcc(Assembler::carryClear, FOUND_CHAR);
6370 addptr(result, 32);
6371 subl(tmp, 2*stride);
6372 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6373 jmp(SCAN_TO_8_CHAR);
6374 bind(SCAN_TO_8_CHAR_INIT);
6375 movdl(vec1, ch);
6376 pshuflw(vec1, vec1, 0x00);
6377 pshufd(vec1, vec1, 0);
6378 pxor(vec2, vec2);
6379 }
6380 bind(SCAN_TO_8_CHAR);
6381 cmpl(cnt1, stride);
6382 if (UseAVX >= 2) {
6383 jcc(Assembler::less, SCAN_TO_CHAR);
6384 } else {
6385 jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6386 movdl(vec1, ch);
6387 pshuflw(vec1, vec1, 0x00);
6388 pshufd(vec1, vec1, 0);
6389 pxor(vec2, vec2);
6390 }
6391 movl(tmp, cnt1);
6392 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
6393 andl(cnt1,0x00000007); //tail count (in chars)
6394
6395 bind(SCAN_TO_8_CHAR_LOOP);
6396 movdqu(vec3, Address(result, 0));
6397 pcmpeqw(vec3, vec1);
6398 ptest(vec2, vec3);
6399 jcc(Assembler::carryClear, FOUND_CHAR);
6400 addptr(result, 16);
6401 subl(tmp, stride);
6402 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6403 bind(SCAN_TO_CHAR);
6404 testl(cnt1, cnt1);
6405 jcc(Assembler::zero, RET_NOT_FOUND);
6406 bind(SCAN_TO_CHAR_LOOP);
6407 load_unsigned_short(tmp, Address(result, 0));
6408 cmpl(ch, tmp);
6409 jccb(Assembler::equal, FOUND_SEQ_CHAR);
6410 addptr(result, 2);
6411 subl(cnt1, 1);
6412 jccb(Assembler::zero, RET_NOT_FOUND);
6413 jmp(SCAN_TO_CHAR_LOOP);
6414
6415 bind(RET_NOT_FOUND);
6416 movl(result, -1);
6417 jmpb(DONE_LABEL);
6418
6419 bind(FOUND_CHAR);
6420 if (UseAVX >= 2) {
6421 vpmovmskb(tmp, vec3);
6422 } else {
6423 pmovmskb(tmp, vec3);
6424 }
6425 bsfl(ch, tmp);
6426 addl(result, ch);
6427
6428 bind(FOUND_SEQ_CHAR);
6429 subptr(result, str1);
6430 shrl(result, 1);
6431
6432 bind(DONE_LABEL);
6433} // string_indexof_char
6434
6435// helper function for string_compare
6436void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6437 Address::ScaleFactor scale, Address::ScaleFactor scale1,
6438 Address::ScaleFactor scale2, Register index, int ae) {
6439 if (ae == StrIntrinsicNode::LL) {
6440 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6441 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6442 } else if (ae == StrIntrinsicNode::UU) {
6443 load_unsigned_short(elem1, Address(str1, index, scale, 0));
6444 load_unsigned_short(elem2, Address(str2, index, scale, 0));
6445 } else {
6446 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6447 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6448 }
6449}
6450
6451// Compare strings, used for char[] and byte[].
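// Roughly equivalent Java for reference (a simplified sketch; the real
// intrinsic additionally handles the mixed Latin1/UTF-16 encodings selected
// by 'ae' and the UL result negation at the end):
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }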
6452void MacroAssembler::string_compare(Register str1, Register str2,
6453 Register cnt1, Register cnt2, Register result,
6454 XMMRegister vec1, int ae) {
6455 ShortBranchVerifier sbv(this);
6456 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6457 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
6458 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6459 int stride2x2 = 0x40;
6460 Address::ScaleFactor scale = Address::no_scale;
6461 Address::ScaleFactor scale1 = Address::no_scale;
6462 Address::ScaleFactor scale2 = Address::no_scale;
6463
6464 if (ae != StrIntrinsicNode::LL) {
6465 stride2x2 = 0x20;
6466 }
6467
6468 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6469 shrl(cnt2, 1);
6470 }
6471 // Compute the minimum of the string lengths and the
6472 // difference of the string lengths (stack).
6473 // Do the conditional move stuff
6474 movl(result, cnt1);
6475 subl(cnt1, cnt2);
6476 push(cnt1);
6477 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
6478
6479 // Is the minimum length zero?
6480 testl(cnt2, cnt2);
6481 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6482 if (ae == StrIntrinsicNode::LL) {
6483 // Load first bytes
6484 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
6485 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
6486 } else if (ae == StrIntrinsicNode::UU) {
6487 // Load first characters
6488 load_unsigned_short(result, Address(str1, 0));
6489 load_unsigned_short(cnt1, Address(str2, 0));
6490 } else {
6491 load_unsigned_byte(result, Address(str1, 0));
6492 load_unsigned_short(cnt1, Address(str2, 0));
6493 }
6494 subl(result, cnt1);
6495 jcc(Assembler::notZero, POP_LABEL);
6496
6497 if (ae == StrIntrinsicNode::UU) {
6498 // Divide length by 2 to get number of chars
6499 shrl(cnt2, 1);
6500 }
6501 cmpl(cnt2, 1);
6502 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6503
6504  // Check if the strings start at the same location and set up the scale and stride
6505 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6506 cmpptr(str1, str2);
6507 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6508 if (ae == StrIntrinsicNode::LL) {
6509 scale = Address::times_1;
6510 stride = 16;
6511 } else {
6512 scale = Address::times_2;
6513 stride = 8;
6514 }
6515 } else {
6516 scale1 = Address::times_1;
6517 scale2 = Address::times_2;
6518 // scale not used
6519 stride = 8;
6520 }
6521
6522 if (UseAVX >= 2 && UseSSE42Intrinsics) {
6523 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6524 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6525 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
6526 Label COMPARE_TAIL_LONG;
6527 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
6528
6529 int pcmpmask = 0x19;
6530 if (ae == StrIntrinsicNode::LL) {
6531 pcmpmask &= ~0x01;
6532 }
6533
6534    // Set up to compare 16-char (32-byte) vectors,
6535    // starting from the first character again because its address is aligned.
6536 if (ae == StrIntrinsicNode::LL) {
6537 stride2 = 32;
6538 } else {
6539 stride2 = 16;
6540 }
6541 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6542 adr_stride = stride << scale;
6543 } else {
6544 adr_stride1 = 8; //stride << scale1;
6545 adr_stride2 = 16; //stride << scale2;
6546 }
6547
6548 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6549    // rax and rdx are used by pcmpestri as element counters
6550 movl(result, cnt2);
6551 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
6552 jcc(Assembler::zero, COMPARE_TAIL_LONG);
6553
6554 // fast path : compare first 2 8-char vectors.
6555 bind(COMPARE_16_CHARS);
6556 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6557 movdqu(vec1, Address(str1, 0));
6558 } else {
6559 pmovzxbw(vec1, Address(str1, 0));
6560 }
6561 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6562 jccb(Assembler::below, COMPARE_INDEX_CHAR);
6563
6564 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6565 movdqu(vec1, Address(str1, adr_stride));
6566 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6567 } else {
6568 pmovzxbw(vec1, Address(str1, adr_stride1));
6569 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
6570 }
6571 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6572 addl(cnt1, stride);
6573
6574 // Compare the characters at index in cnt1
6575 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
6576 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6577 subl(result, cnt2);
6578 jmp(POP_LABEL);
6579
6580    // Set up the registers to start the vector comparison loop
6581 bind(COMPARE_WIDE_VECTORS);
6582 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6583 lea(str1, Address(str1, result, scale));
6584 lea(str2, Address(str2, result, scale));
6585 } else {
6586 lea(str1, Address(str1, result, scale1));
6587 lea(str2, Address(str2, result, scale2));
6588 }
6589 subl(result, stride2);
6590 subl(cnt2, stride2);
6591 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
6592 negptr(result);
6593
6594    // In a loop, compare 16 chars (32 bytes) at a time using vpxor+vptest
6595 bind(COMPARE_WIDE_VECTORS_LOOP);
6596
6597#ifdef _LP64
6598 if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6599 cmpl(cnt2, stride2x2);
6600 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6601 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
6602 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
6603
6604 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6605 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6606 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
6607 evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6608 } else {
6609 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
6610 evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6611 }
6612 kortestql(k7, k7);
6613 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
6614 addptr(result, stride2x2); // update since we already compared at this addr
6615 subl(cnt2, stride2x2); // and sub the size too
6616 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6617
6618 vpxor(vec1, vec1);
6619 jmpb(COMPARE_WIDE_TAIL);
6620 }//if (VM_Version::supports_avx512vlbw())
6621#endif // _LP64
6622
6623
6624 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6625 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6626 vmovdqu(vec1, Address(str1, result, scale));
6627 vpxor(vec1, Address(str2, result, scale));
6628 } else {
6629 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
6630 vpxor(vec1, Address(str2, result, scale2));
6631 }
6632 vptest(vec1, vec1);
6633 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
6634 addptr(result, stride2);
6635 subl(cnt2, stride2);
6636 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6637 // clean upper bits of YMM registers
6638 vpxor(vec1, vec1);
6639
6640 // compare wide vectors tail
6641 bind(COMPARE_WIDE_TAIL);
6642 testptr(result, result);
6643 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6644
6645 movl(result, stride2);
6646 movl(cnt2, result);
6647 negptr(result);
6648 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6649
6650    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6651 bind(VECTOR_NOT_EQUAL);
6652 // clean upper bits of YMM registers
6653 vpxor(vec1, vec1);
6654 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6655 lea(str1, Address(str1, result, scale));
6656 lea(str2, Address(str2, result, scale));
6657 } else {
6658 lea(str1, Address(str1, result, scale1));
6659 lea(str2, Address(str2, result, scale2));
6660 }
6661 jmp(COMPARE_16_CHARS);
6662
6663    // Compare tail chars, length between 1 and 15 chars
6664 bind(COMPARE_TAIL_LONG);
6665 movl(cnt2, result);
6666 cmpl(cnt2, stride);
6667 jcc(Assembler::less, COMPARE_SMALL_STR);
6668
6669 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6670 movdqu(vec1, Address(str1, 0));
6671 } else {
6672 pmovzxbw(vec1, Address(str1, 0));
6673 }
6674 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6675 jcc(Assembler::below, COMPARE_INDEX_CHAR);
6676 subptr(cnt2, stride);
6677 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6678 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6679 lea(str1, Address(str1, result, scale));
6680 lea(str2, Address(str2, result, scale));
6681 } else {
6682 lea(str1, Address(str1, result, scale1));
6683 lea(str2, Address(str2, result, scale2));
6684 }
6685 negptr(cnt2);
6686 jmpb(WHILE_HEAD_LABEL);
6687
6688 bind(COMPARE_SMALL_STR);
6689 } else if (UseSSE42Intrinsics) {
6690 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6691 int pcmpmask = 0x19;
6692    // Set up to compare 8-char (16-byte) vectors,
6693    // starting from the first character again because its address is aligned.
6694 movl(result, cnt2);
6695 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
6696 if (ae == StrIntrinsicNode::LL) {
6697 pcmpmask &= ~0x01;
6698 }
6699 jcc(Assembler::zero, COMPARE_TAIL);
6700 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6701 lea(str1, Address(str1, result, scale));
6702 lea(str2, Address(str2, result, scale));
6703 } else {
6704 lea(str1, Address(str1, result, scale1));
6705 lea(str2, Address(str2, result, scale2));
6706 }
6707 negptr(result);
6708
6709 // pcmpestri
6710 // inputs:
6711 // vec1- substring
6712 // rax - negative string length (elements count)
6713 // mem - scanned string
6714 // rdx - string length (elements count)
6715 // pcmpmask - cmp mode: 11000 (string compare with negated result)
6716 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
6717 // outputs:
6718 // rcx - first mismatched element index
6719 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6720
6721 bind(COMPARE_WIDE_VECTORS);
6722 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6723 movdqu(vec1, Address(str1, result, scale));
6724 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6725 } else {
6726 pmovzxbw(vec1, Address(str1, result, scale1));
6727 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6728 }
6729 // After pcmpestri cnt1(rcx) contains mismatched element index
6730
6731 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
6732 addptr(result, stride);
6733 subptr(cnt2, stride);
6734 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6735
6736 // compare wide vectors tail
6737 testptr(result, result);
6738 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6739
6740 movl(cnt2, stride);
6741 movl(result, stride);
6742 negptr(result);
6743 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6744 movdqu(vec1, Address(str1, result, scale));
6745 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6746 } else {
6747 pmovzxbw(vec1, Address(str1, result, scale1));
6748 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6749 }
6750 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6751
6752 // Mismatched characters in the vectors
6753 bind(VECTOR_NOT_EQUAL);
6754 addptr(cnt1, result);
6755 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6756 subl(result, cnt2);
6757 jmpb(POP_LABEL);
6758
6759 bind(COMPARE_TAIL); // limit is zero
6760 movl(cnt2, result);
6761 // Fallthru to tail compare
6762 }
6763 // Shift str2 and str1 to the end of the arrays, negate min
6764 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6765 lea(str1, Address(str1, cnt2, scale));
6766 lea(str2, Address(str2, cnt2, scale));
6767 } else {
6768 lea(str1, Address(str1, cnt2, scale1));
6769 lea(str2, Address(str2, cnt2, scale2));
6770 }
6771 decrementl(cnt2); // first character was compared already
6772 negptr(cnt2);
6773
6774 // Compare the rest of the elements
6775 bind(WHILE_HEAD_LABEL);
6776 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
6777 subl(result, cnt1);
6778 jccb(Assembler::notZero, POP_LABEL);
6779 increment(cnt2);
6780 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6781
6782 // Strings are equal up to min length. Return the length difference.
6783 bind(LENGTH_DIFF_LABEL);
6784 pop(result);
6785 if (ae == StrIntrinsicNode::UU) {
6786 // Divide diff by 2 to get number of chars
6787 sarl(result, 1);
6788 }
6789 jmpb(DONE_LABEL);
6790
6791#ifdef _LP64
6792 if (VM_Version::supports_avx512vlbw()) {
6793
6794 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
6795
6796 kmovql(cnt1, k7);
6797 notq(cnt1);
6798 bsfq(cnt2, cnt1);
6799 if (ae != StrIntrinsicNode::LL) {
6800 // Divide diff by 2 to get number of chars
6801 sarl(cnt2, 1);
6802 }
6803 addq(result, cnt2);
6804 if (ae == StrIntrinsicNode::LL) {
6805 load_unsigned_byte(cnt1, Address(str2, result));
6806 load_unsigned_byte(result, Address(str1, result));
6807 } else if (ae == StrIntrinsicNode::UU) {
6808 load_unsigned_short(cnt1, Address(str2, result, scale));
6809 load_unsigned_short(result, Address(str1, result, scale));
6810 } else {
6811 load_unsigned_short(cnt1, Address(str2, result, scale2));
6812 load_unsigned_byte(result, Address(str1, result, scale1));
6813 }
6814 subl(result, cnt1);
6815 jmpb(POP_LABEL);
6816 }//if (VM_Version::supports_avx512vlbw())
6817#endif // _LP64
6818
6819 // Discard the stored length difference
6820 bind(POP_LABEL);
6821 pop(cnt1);
6822
6823 // That's it
6824 bind(DONE_LABEL);
6825 if(ae == StrIntrinsicNode::UL) {
6826 negl(result);
6827 }
6828
6829}
6830
6831// Search for Non-ASCII character (Negative byte value) in a byte array,
6832// return true if it has any and false otherwise.
6833// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
6834// @HotSpotIntrinsicCandidate
6835// private static boolean hasNegatives(byte[] ba, int off, int len) {
6836// for (int i = off; i < off + len; i++) {
6837// if (ba[i] < 0) {
6838// return true;
6839// }
6840// }
6841// return false;
6842// }
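// The vector loops below implement this by testing the sign bit of every byte:
// the SSE4.2/AVX2 paths test against a broadcast 0x80808080 mask, while the
// AVX-512 path uses a signed greater-than compare of zero against the data.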
6843void MacroAssembler::has_negatives(Register ary1, Register len,
6844 Register result, Register tmp1,
6845 XMMRegister vec1, XMMRegister vec2) {
6846 // rsi: byte array
6847 // rcx: len
6848 // rax: result
6849 ShortBranchVerifier sbv(this);
6850 assert_different_registers(ary1, len, result, tmp1);
6851 assert_different_registers(vec1, vec2);
6852 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
6853
6854 // len == 0
6855 testl(len, len);
6856 jcc(Assembler::zero, FALSE_LABEL);
6857
6858 if ((UseAVX > 2) && // AVX512
6859 VM_Version::supports_avx512vlbw() &&
6860 VM_Version::supports_bmi2()) {
6861
6862 Label test_64_loop, test_tail;
6863 Register tmp3_aliased = len;
6864
6865 movl(tmp1, len);
6866 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
6867
6868 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
6869 andl(len, ~(64 - 1)); // vector count (in chars)
6870 jccb(Assembler::zero, test_tail);
6871
6872 lea(ary1, Address(ary1, len, Address::times_1));
6873 negptr(len);
6874
6875 bind(test_64_loop);
6876 // Check whether our 64 elements of size byte contain negatives
6877 evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
6878 kortestql(k2, k2);
6879 jcc(Assembler::notZero, TRUE_LABEL);
6880
6881 addptr(len, 64);
6882 jccb(Assembler::notZero, test_64_loop);
6883
6884
6885 bind(test_tail);
6886 // bail out when there is nothing to be done
6887 testl(tmp1, -1);
6888 jcc(Assembler::zero, FALSE_LABEL);
6889
6890 // ~(~0 << len) applied up to two times (for 32-bit scenario)
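    // e.g. tmp1 == 5 yields the mask ~(~0 << 5) == 0x1F (five 1's in the low
    // bits), so only the five tail bytes take part in the masked compare below.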
6891#ifdef _LP64
6892 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
6893 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
6894 notq(tmp3_aliased);
6895 kmovql(k3, tmp3_aliased);
6896#else
6897 Label k_init;
6898 jmp(k_init);
6899
6900    // On 32-bit we cannot materialize the 64-bit mask in a general-purpose register,
6901    // so the data required to compose it is emitted into the instruction stream instead.
6902    // We emit a 64-byte wide series of the values 0..63, which is later used as a
6903    // compare target against the tail count held in the tmp1 register.
6904    // The result is a k register holding tmp1 consecutive 1's, counting from the
6905    // least significant bit.
6906 address tmp = pc();
6907 emit_int64(0x0706050403020100);
6908 emit_int64(0x0F0E0D0C0B0A0908);
6909 emit_int64(0x1716151413121110);
6910 emit_int64(0x1F1E1D1C1B1A1918);
6911 emit_int64(0x2726252423222120);
6912 emit_int64(0x2F2E2D2C2B2A2928);
6913 emit_int64(0x3736353433323130);
6914 emit_int64(0x3F3E3D3C3B3A3938);
6915
6916 bind(k_init);
6917 lea(len, InternalAddress(tmp));
6918 // create mask to test for negative byte inside a vector
6919 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
6920 evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
6921
6922#endif
6923 evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
6924 ktestq(k2, k3);
6925 jcc(Assembler::notZero, TRUE_LABEL);
6926
6927 jmp(FALSE_LABEL);
6928 } else {
6929 movl(result, len); // copy
6930
6931 if (UseAVX == 2 && UseSSE >= 2) {
6932 // With AVX2, use 32-byte vector compare
6933 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6934
6935 // Compare 32-byte vectors
6936 andl(result, 0x0000001f); // tail count (in bytes)
6937 andl(len, 0xffffffe0); // vector count (in bytes)
6938 jccb(Assembler::zero, COMPARE_TAIL);
6939
6940 lea(ary1, Address(ary1, len, Address::times_1));
6941 negptr(len);
6942
6943 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
6944 movdl(vec2, tmp1);
6945 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
6946
6947 bind(COMPARE_WIDE_VECTORS);
6948 vmovdqu(vec1, Address(ary1, len, Address::times_1));
6949 vptest(vec1, vec2);
6950 jccb(Assembler::notZero, TRUE_LABEL);
6951 addptr(len, 32);
6952 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6953
6954 testl(result, result);
6955 jccb(Assembler::zero, FALSE_LABEL);
6956
6957 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6958 vptest(vec1, vec2);
6959 jccb(Assembler::notZero, TRUE_LABEL);
6960 jmpb(FALSE_LABEL);
6961
6962 bind(COMPARE_TAIL); // len is zero
6963 movl(len, result);
6964 // Fallthru to tail compare
6965 } else if (UseSSE42Intrinsics) {
6966 // With SSE4.2, use double quad vector compare
6967 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6968
6969 // Compare 16-byte vectors
6970 andl(result, 0x0000000f); // tail count (in bytes)
6971 andl(len, 0xfffffff0); // vector count (in bytes)
6972 jcc(Assembler::zero, COMPARE_TAIL);
6973
6974 lea(ary1, Address(ary1, len, Address::times_1));
6975 negptr(len);
6976
6977 movl(tmp1, 0x80808080);
6978 movdl(vec2, tmp1);
6979 pshufd(vec2, vec2, 0);
6980
6981 bind(COMPARE_WIDE_VECTORS);
6982 movdqu(vec1, Address(ary1, len, Address::times_1));
6983 ptest(vec1, vec2);
6984 jcc(Assembler::notZero, TRUE_LABEL);
6985 addptr(len, 16);
6986 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6987
6988 testl(result, result);
6989 jcc(Assembler::zero, FALSE_LABEL);
6990
6991 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6992 ptest(vec1, vec2);
6993 jccb(Assembler::notZero, TRUE_LABEL);
6994 jmpb(FALSE_LABEL);
6995
6996 bind(COMPARE_TAIL); // len is zero
6997 movl(len, result);
6998 // Fallthru to tail compare
6999 }
7000 }
7001 // Compare 4-byte vectors
7002 andl(len, 0xfffffffc); // vector count (in bytes)
7003 jccb(Assembler::zero, COMPARE_CHAR);
7004
7005 lea(ary1, Address(ary1, len, Address::times_1));
7006 negptr(len);
7007
7008 bind(COMPARE_VECTORS);
7009 movl(tmp1, Address(ary1, len, Address::times_1));
7010 andl(tmp1, 0x80808080);
7011 jccb(Assembler::notZero, TRUE_LABEL);
7012 addptr(len, 4);
7013 jcc(Assembler::notZero, COMPARE_VECTORS);
7014
7015 // Compare trailing char (final 2 bytes), if any
7016 bind(COMPARE_CHAR);
7017 testl(result, 0x2); // tail char
7018 jccb(Assembler::zero, COMPARE_BYTE);
7019 load_unsigned_short(tmp1, Address(ary1, 0));
7020 andl(tmp1, 0x00008080);
7021 jccb(Assembler::notZero, TRUE_LABEL);
7022 subptr(result, 2);
7023 lea(ary1, Address(ary1, 2));
7024
7025 bind(COMPARE_BYTE);
7026 testl(result, 0x1); // tail byte
7027 jccb(Assembler::zero, FALSE_LABEL);
7028 load_unsigned_byte(tmp1, Address(ary1, 0));
7029 andl(tmp1, 0x00000080);
7030 jccb(Assembler::notEqual, TRUE_LABEL);
7031 jmpb(FALSE_LABEL);
7032
7033 bind(TRUE_LABEL);
7034 movl(result, 1); // return true
7035 jmpb(DONE);
7036
7037 bind(FALSE_LABEL);
7038 xorl(result, result); // return false
7039
7040 // That's it
7041 bind(DONE);
7042 if (UseAVX >= 2 && UseSSE >= 2) {
7043 // clean upper bits of YMM registers
7044 vpxor(vec1, vec1);
7045 vpxor(vec2, vec2);
7046 }
7047}
7048// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
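// Roughly equivalent Java for the is_array_equ case (a reference sketch):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
// For the substring case (is_array_equ == false) only the element loop applies.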
7049void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7050 Register limit, Register result, Register chr,
7051 XMMRegister vec1, XMMRegister vec2, bool is_char) {
7052 ShortBranchVerifier sbv(this);
7053 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7054
7055 int length_offset = arrayOopDesc::length_offset_in_bytes();
7056 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7057
7058 if (is_array_equ) {
7059 // Check the input args
7060 cmpoop(ary1, ary2);
7061 jcc(Assembler::equal, TRUE_LABEL);
7062
7063 // Need additional checks for arrays_equals.
7064 testptr(ary1, ary1);
7065 jcc(Assembler::zero, FALSE_LABEL);
7066 testptr(ary2, ary2);
7067 jcc(Assembler::zero, FALSE_LABEL);
7068
7069 // Check the lengths
7070 movl(limit, Address(ary1, length_offset));
7071 cmpl(limit, Address(ary2, length_offset));
7072 jcc(Assembler::notEqual, FALSE_LABEL);
7073 }
7074
7075 // count == 0
7076 testl(limit, limit);
7077 jcc(Assembler::zero, TRUE_LABEL);
7078
7079 if (is_array_equ) {
7080 // Load array address
7081 lea(ary1, Address(ary1, base_offset));
7082 lea(ary2, Address(ary2, base_offset));
7083 }
7084
7085 if (is_array_equ && is_char) {
7086 // arrays_equals when used for char[].
7087 shll(limit, 1); // byte count != 0
7088 }
7089 movl(result, limit); // copy
7090
7091 if (UseAVX >= 2) {
7092 // With AVX2, use 32-byte vector compare
7093 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7094
7095 // Compare 32-byte vectors
7096 andl(result, 0x0000001f); // tail count (in bytes)
7097 andl(limit, 0xffffffe0); // vector count (in bytes)
7098 jcc(Assembler::zero, COMPARE_TAIL);
7099
7100 lea(ary1, Address(ary1, limit, Address::times_1));
7101 lea(ary2, Address(ary2, limit, Address::times_1));
7102 negptr(limit);
7103
7104 bind(COMPARE_WIDE_VECTORS);
7105
7106#ifdef _LP64
7107 if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7108 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7109
7110 cmpl(limit, -64);
7111 jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7112
7113 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7114
7115 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7116 evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7117 kortestql(k7, k7);
7118 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
7119 addptr(limit, 64); // update since we already compared at this addr
7120 cmpl(limit, -64);
7121 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7122
7123 // At this point we may still need to compare -limit+result bytes.
7124      // We could execute the next two instructions and just continue via the non-wide path:
7125 // cmpl(limit, 0);
7126 // jcc(Assembler::equal, COMPARE_TAIL); // true
7127 // But since we stopped at the points ary{1,2}+limit which are
7128 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7129 // (|limit| <= 32 and result < 32),
7130 // we may just compare the last 64 bytes.
7131 //
7132      addptr(result, -64);   // it is safe because we just came from this area
7133 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7134 evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7135 kortestql(k7, k7);
7136 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
7137
7138 jmp(TRUE_LABEL);
7139
7140 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7141
7142 }//if (VM_Version::supports_avx512vlbw())
7143#endif //_LP64
7144
7145 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7146 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7147 vpxor(vec1, vec2);
7148
7149 vptest(vec1, vec1);
7150 jcc(Assembler::notZero, FALSE_LABEL);
7151 addptr(limit, 32);
7152 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7153
7154 testl(result, result);
7155 jcc(Assembler::zero, TRUE_LABEL);
7156
7157 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7158 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7159 vpxor(vec1, vec2);
7160
7161 vptest(vec1, vec1);
7162 jccb(Assembler::notZero, FALSE_LABEL);
7163 jmpb(TRUE_LABEL);
7164
7165 bind(COMPARE_TAIL); // limit is zero
7166 movl(limit, result);
7167 // Fallthru to tail compare
7168 } else if (UseSSE42Intrinsics) {
7169 // With SSE4.2, use double quad vector compare
7170 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7171
7172 // Compare 16-byte vectors
7173 andl(result, 0x0000000f); // tail count (in bytes)
7174 andl(limit, 0xfffffff0); // vector count (in bytes)
7175 jcc(Assembler::zero, COMPARE_TAIL);
7176
7177 lea(ary1, Address(ary1, limit, Address::times_1));
7178 lea(ary2, Address(ary2, limit, Address::times_1));
7179 negptr(limit);
7180
7181 bind(COMPARE_WIDE_VECTORS);
7182 movdqu(vec1, Address(ary1, limit, Address::times_1));
7183 movdqu(vec2, Address(ary2, limit, Address::times_1));
7184 pxor(vec1, vec2);
7185
7186 ptest(vec1, vec1);
7187 jcc(Assembler::notZero, FALSE_LABEL);
7188 addptr(limit, 16);
7189 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7190
7191 testl(result, result);
7192 jcc(Assembler::zero, TRUE_LABEL);
7193
7194 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7195 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7196 pxor(vec1, vec2);
7197
7198 ptest(vec1, vec1);
7199 jccb(Assembler::notZero, FALSE_LABEL);
7200 jmpb(TRUE_LABEL);
7201
7202 bind(COMPARE_TAIL); // limit is zero
7203 movl(limit, result);
7204 // Fallthru to tail compare
7205 }
7206
7207 // Compare 4-byte vectors
7208 andl(limit, 0xfffffffc); // vector count (in bytes)
7209 jccb(Assembler::zero, COMPARE_CHAR);
7210
7211 lea(ary1, Address(ary1, limit, Address::times_1));
7212 lea(ary2, Address(ary2, limit, Address::times_1));
7213 negptr(limit);
7214
7215 bind(COMPARE_VECTORS);
7216 movl(chr, Address(ary1, limit, Address::times_1));
7217 cmpl(chr, Address(ary2, limit, Address::times_1));
7218 jccb(Assembler::notEqual, FALSE_LABEL);
7219 addptr(limit, 4);
7220 jcc(Assembler::notZero, COMPARE_VECTORS);
7221
7222 // Compare trailing char (final 2 bytes), if any
7223 bind(COMPARE_CHAR);
7224 testl(result, 0x2); // tail char
7225 jccb(Assembler::zero, COMPARE_BYTE);
7226 load_unsigned_short(chr, Address(ary1, 0));
7227 load_unsigned_short(limit, Address(ary2, 0));
7228 cmpl(chr, limit);
7229 jccb(Assembler::notEqual, FALSE_LABEL);
7230
7231 if (is_array_equ && is_char) {
7232 bind(COMPARE_BYTE);
7233 } else {
7234 lea(ary1, Address(ary1, 2));
7235 lea(ary2, Address(ary2, 2));
7236
7237 bind(COMPARE_BYTE);
7238 testl(result, 0x1); // tail byte
7239 jccb(Assembler::zero, TRUE_LABEL);
7240 load_unsigned_byte(chr, Address(ary1, 0));
7241 load_unsigned_byte(limit, Address(ary2, 0));
7242 cmpl(chr, limit);
7243 jccb(Assembler::notEqual, FALSE_LABEL);
7244 }
7245 bind(TRUE_LABEL);
7246 movl(result, 1); // return true
7247 jmpb(DONE);
7248
7249 bind(FALSE_LABEL);
7250 xorl(result, result); // return false
7251
7252 // That's it
7253 bind(DONE);
7254 if (UseAVX >= 2) {
7255 // clean upper bits of YMM registers
7256 vpxor(vec1, vec1);
7257 vpxor(vec2, vec2);
7258 }
7259}
7260
7261#endif
7262
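// Fill 'count' elements of type 't' starting at 'to' with 'value'. Roughly
// equivalent Java for the int case (a reference sketch):
//   static void fill(int[] a, int from, int count, int val) {
//     for (int i = 0; i < count; i++) {
//       a[from + i] = val;
//     }
//   }
// Byte and short values are first replicated into a 32-bit pattern so that
// most of the work can be done with 4-byte and vector-width stores.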
7263void MacroAssembler::generate_fill(BasicType t, bool aligned,
7264 Register to, Register value, Register count,
7265 Register rtmp, XMMRegister xtmp) {
7266 ShortBranchVerifier sbv(this);
7267 assert_different_registers(to, value, count, rtmp);
7268 Label L_exit;
7269 Label L_fill_2_bytes, L_fill_4_bytes;
7270
7271 int shift = -1;
7272 switch (t) {
7273 case T_BYTE:
7274 shift = 2;
7275 break;
7276 case T_SHORT:
7277 shift = 1;
7278 break;
7279 case T_INT:
7280 shift = 0;
7281 break;
7282 default: ShouldNotReachHere();
7283 }
7284
7285 if (t == T_BYTE) {
7286 andl(value, 0xff);
7287 movl(rtmp, value);
7288 shll(rtmp, 8);
7289 orl(value, rtmp);
7290 }
7291 if (t == T_SHORT) {
7292 andl(value, 0xffff);
7293 }
7294 if (t == T_BYTE || t == T_SHORT) {
7295 movl(rtmp, value);
7296 shll(rtmp, 16);
7297 orl(value, rtmp);
7298 }
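  // e.g. a byte value of 0x41 has now been widened to 0x41414141 and a short
  // value of 0x0041 to 0x00410041, so the 32-bit and vector stores below write
  // the correct repeating pattern.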
7299
7300 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7301 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7302 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7303 Label L_skip_align2;
7304    // align the destination address to a 4-byte boundary
7305 if (t == T_BYTE) {
7306 Label L_skip_align1;
7307 // One byte misalignment happens only for byte arrays
7308 testptr(to, 1);
7309 jccb(Assembler::zero, L_skip_align1);
7310 movb(Address(to, 0), value);
7311 increment(to);
7312 decrement(count);
7313 BIND(L_skip_align1);
7314 }
7315 // Two bytes misalignment happens only for byte and short (char) arrays
7316 testptr(to, 2);
7317 jccb(Assembler::zero, L_skip_align2);
7318 movw(Address(to, 0), value);
7319 addptr(to, 2);
7320 subl(count, 1<<(shift-1));
7321 BIND(L_skip_align2);
7322 }
7323 if (UseSSE < 2) {
7324 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7325 // Fill 32-byte chunks
7326 subl(count, 8 << shift);
7327 jcc(Assembler::less, L_check_fill_8_bytes);
7328 align(16);
7329
7330 BIND(L_fill_32_bytes_loop);
7331
7332 for (int i = 0; i < 32; i += 4) {
7333 movl(Address(to, i), value);
7334 }
7335
7336 addptr(to, 32);
7337 subl(count, 8 << shift);
7338 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7339 BIND(L_check_fill_8_bytes);
7340 addl(count, 8 << shift);
7341 jccb(Assembler::zero, L_exit);
7342 jmpb(L_fill_8_bytes);
7343
7344 //
7345 // length is too short, just fill qwords
7346 //
7347 BIND(L_fill_8_bytes_loop);
7348 movl(Address(to, 0), value);
7349 movl(Address(to, 4), value);
7350 addptr(to, 8);
7351 BIND(L_fill_8_bytes);
7352 subl(count, 1 << (shift + 1));
7353 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7354 // fall through to fill 4 bytes
7355 } else {
7356 Label L_fill_32_bytes;
7357 if (!UseUnalignedLoadStores) {
7358 // align to 8 bytes, we know we are 4 byte aligned to start
7359 testptr(to, 4);
7360 jccb(Assembler::zero, L_fill_32_bytes);
7361 movl(Address(to, 0), value);
7362 addptr(to, 4);
7363 subl(count, 1<<shift);
7364 }
7365 BIND(L_fill_32_bytes);
7366 {
7367 assert( UseSSE >= 2, "supported cpu only" );
7368 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7369 movdl(xtmp, value);
7370 if (UseAVX > 2 && UseUnalignedLoadStores) {
7371 // Fill 64-byte chunks
7372 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7373 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7374
7375 subl(count, 16 << shift);
7376 jcc(Assembler::less, L_check_fill_32_bytes);
7377 align(16);
7378
7379 BIND(L_fill_64_bytes_loop);
7380 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7381 addptr(to, 64);
7382 subl(count, 16 << shift);
7383 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7384
7385 BIND(L_check_fill_32_bytes);
7386 addl(count, 8 << shift);
7387 jccb(Assembler::less, L_check_fill_8_bytes);
7388 vmovdqu(Address(to, 0), xtmp);
7389 addptr(to, 32);
7390 subl(count, 8 << shift);
7391
7392 BIND(L_check_fill_8_bytes);
7393 } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7394 // Fill 64-byte chunks
7395 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7396 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7397
7398 subl(count, 16 << shift);
7399 jcc(Assembler::less, L_check_fill_32_bytes);
7400 align(16);
7401
7402 BIND(L_fill_64_bytes_loop);
7403 vmovdqu(Address(to, 0), xtmp);
7404 vmovdqu(Address(to, 32), xtmp);
7405 addptr(to, 64);
7406 subl(count, 16 << shift);
7407 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7408
7409 BIND(L_check_fill_32_bytes);
7410 addl(count, 8 << shift);
7411 jccb(Assembler::less, L_check_fill_8_bytes);
7412 vmovdqu(Address(to, 0), xtmp);
7413 addptr(to, 32);
7414 subl(count, 8 << shift);
7415
7416 BIND(L_check_fill_8_bytes);
7417 // clean upper bits of YMM registers
7418 movdl(xtmp, value);
7419 pshufd(xtmp, xtmp, 0);
7420 } else {
7421 // Fill 32-byte chunks
7422 pshufd(xtmp, xtmp, 0);
7423
7424 subl(count, 8 << shift);
7425 jcc(Assembler::less, L_check_fill_8_bytes);
7426 align(16);
7427
7428 BIND(L_fill_32_bytes_loop);
7429
7430 if (UseUnalignedLoadStores) {
7431 movdqu(Address(to, 0), xtmp);
7432 movdqu(Address(to, 16), xtmp);
7433 } else {
7434 movq(Address(to, 0), xtmp);
7435 movq(Address(to, 8), xtmp);
7436 movq(Address(to, 16), xtmp);
7437 movq(Address(to, 24), xtmp);
7438 }
7439
7440 addptr(to, 32);
7441 subl(count, 8 << shift);
7442 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7443
7444 BIND(L_check_fill_8_bytes);
7445 }
7446 addl(count, 8 << shift);
7447 jccb(Assembler::zero, L_exit);
7448 jmpb(L_fill_8_bytes);
7449
7450 //
7451 // length is too short, just fill qwords
7452 //
7453 BIND(L_fill_8_bytes_loop);
7454 movq(Address(to, 0), xtmp);
7455 addptr(to, 8);
7456 BIND(L_fill_8_bytes);
7457 subl(count, 1 << (shift + 1));
7458 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7459 }
7460 }
7461 // fill trailing 4 bytes
7462 BIND(L_fill_4_bytes);
7463 testl(count, 1<<shift);
7464 jccb(Assembler::zero, L_fill_2_bytes);
7465 movl(Address(to, 0), value);
7466 if (t == T_BYTE || t == T_SHORT) {
7467 Label L_fill_byte;
7468 addptr(to, 4);
7469 BIND(L_fill_2_bytes);
7470 // fill trailing 2 bytes
7471 testl(count, 1<<(shift-1));
7472 jccb(Assembler::zero, L_fill_byte);
7473 movw(Address(to, 0), value);
7474 if (t == T_BYTE) {
7475 addptr(to, 2);
7476 BIND(L_fill_byte);
7477 // fill trailing byte
7478 testl(count, 1);
7479 jccb(Assembler::zero, L_exit);
7480 movb(Address(to, 0), value);
7481 } else {
7482 BIND(L_fill_byte);
7483 }
7484 } else {
7485 BIND(L_fill_2_bytes);
7486 }
7487 BIND(L_exit);
7488}
7489
7490// encode char[] to byte[] in ISO_8859_1
7491 // @HotSpotIntrinsicCandidate
7492 // private static int implEncodeISOArray(byte[] sa, int sp,
7493 //                                       byte[] da, int dp, int len) {
7494 //   int i = 0;
7495 //   for (; i < len; i++) {
7496 //     char c = StringUTF16.getChar(sa, sp++);
7497 //     if (c > '\u00FF')
7498 //       break;
7499 //     da[dp++] = (byte)c;
7500 //   }
7501 //   return i;
7502 // }
7503void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7504 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7505 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7506 Register tmp5, Register result) {
7507
7508 // rsi: src
7509 // rdi: dst
7510 // rdx: len
7511 // rcx: tmp5
7512 // rax: result
7513 ShortBranchVerifier sbv(this);
7514 assert_different_registers(src, dst, len, tmp5, result);
7515 Label L_done, L_copy_1_char, L_copy_1_char_exit;
7516
7517 // set result
7518 xorl(result, result);
7519 // check for zero length
7520 testl(len, len);
7521 jcc(Assembler::zero, L_done);
7522
7523 movl(result, len);
7524
7525  // Set up pointers
7526 lea(src, Address(src, len, Address::times_2)); // char[]
7527 lea(dst, Address(dst, len, Address::times_1)); // byte[]
7528 negptr(len);
7529
7530 if (UseSSE42Intrinsics || UseAVX >= 2) {
7531 Label L_copy_8_chars, L_copy_8_chars_exit;
7532 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7533
7534 if (UseAVX >= 2) {
7535 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7536 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7537 movdl(tmp1Reg, tmp5);
7538 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7539 jmp(L_chars_32_check);
7540
7541 bind(L_copy_32_chars);
7542 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7543 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7544 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7545 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7546 jccb(Assembler::notZero, L_copy_32_chars_exit);
7547 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7548 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7549 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7550
7551 bind(L_chars_32_check);
7552 addptr(len, 32);
7553 jcc(Assembler::lessEqual, L_copy_32_chars);
7554
7555 bind(L_copy_32_chars_exit);
7556 subptr(len, 16);
7557 jccb(Assembler::greater, L_copy_16_chars_exit);
7558
7559 } else if (UseSSE42Intrinsics) {
7560 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7561 movdl(tmp1Reg, tmp5);
7562 pshufd(tmp1Reg, tmp1Reg, 0);
7563 jmpb(L_chars_16_check);
7564 }
7565
7566 bind(L_copy_16_chars);
7567 if (UseAVX >= 2) {
7568 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7569 vptest(tmp2Reg, tmp1Reg);
7570 jcc(Assembler::notZero, L_copy_16_chars_exit);
7571 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7572 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7573 } else {
7574 if (UseAVX > 0) {
7575 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7576 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7577 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7578 } else {
7579 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7580 por(tmp2Reg, tmp3Reg);
7581 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7582 por(tmp2Reg, tmp4Reg);
7583 }
7584 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7585 jccb(Assembler::notZero, L_copy_16_chars_exit);
7586 packuswb(tmp3Reg, tmp4Reg);
7587 }
7588 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7589
7590 bind(L_chars_16_check);
7591 addptr(len, 16);
7592 jcc(Assembler::lessEqual, L_copy_16_chars);
7593
7594 bind(L_copy_16_chars_exit);
7595 if (UseAVX >= 2) {
7596 // clean upper bits of YMM registers
7597 vpxor(tmp2Reg, tmp2Reg);
7598 vpxor(tmp3Reg, tmp3Reg);
7599 vpxor(tmp4Reg, tmp4Reg);
7600 movdl(tmp1Reg, tmp5);
7601 pshufd(tmp1Reg, tmp1Reg, 0);
7602 }
7603 subptr(len, 8);
7604 jccb(Assembler::greater, L_copy_8_chars_exit);
7605
7606 bind(L_copy_8_chars);
7607 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7608 ptest(tmp3Reg, tmp1Reg);
7609 jccb(Assembler::notZero, L_copy_8_chars_exit);
7610 packuswb(tmp3Reg, tmp1Reg);
7611 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7612 addptr(len, 8);
7613 jccb(Assembler::lessEqual, L_copy_8_chars);
7614
7615 bind(L_copy_8_chars_exit);
7616 subptr(len, 8);
7617 jccb(Assembler::zero, L_done);
7618 }
7619
7620 bind(L_copy_1_char);
7621 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7622 testl(tmp5, 0xff00); // check if Unicode char
7623 jccb(Assembler::notZero, L_copy_1_char_exit);
7624 movb(Address(dst, len, Address::times_1, 0), tmp5);
7625 addptr(len, 1);
7626 jccb(Assembler::less, L_copy_1_char);
7627
7628 bind(L_copy_1_char_exit);
7629  addptr(result, len); // len is the negative count of unprocessed elements
7630
7631 bind(L_done);
7632}
7633
7634#ifdef _LP64
7635/**
7636 * Helper for multiply_to_len().
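 * Computes dest_hi:dest_lo += src1 + src2 as a 128-bit sum; each 64-bit
 * addition into dest_lo propagates its carry into dest_hi.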
7637 */
7638void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7639 addq(dest_lo, src1);
7640 adcq(dest_hi, 0);
7641 addq(dest_lo, src2);
7642 adcq(dest_hi, 0);
7643}
7644
7645/**
7646 * Multiply 64 bit by 64 bit first loop.
7647 */
7648void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7649 Register y, Register y_idx, Register z,
7650 Register carry, Register product,
7651 Register idx, Register kdx) {
7652 //
7653 // jlong carry, x[], y[], z[];
7654  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7655 // huge_128 product = y[idx] * x[xstart] + carry;
7656 // z[kdx] = (jlong)product;
7657 // carry = (jlong)(product >>> 64);
7658 // }
7659 // z[xstart] = carry;
7660 //
7661
7662 Label L_first_loop, L_first_loop_exit;
7663 Label L_one_x, L_one_y, L_multiply;
7664
7665 decrementl(xstart);
7666 jcc(Assembler::negative, L_one_x);
7667
7668 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7669 rorq(x_xstart, 32); // convert big-endian to little-endian
7670
7671 bind(L_first_loop);
7672 decrementl(idx);
7673 jcc(Assembler::negative, L_first_loop_exit);
7674 decrementl(idx);
7675 jcc(Assembler::negative, L_one_y);
7676 movq(y_idx, Address(y, idx, Address::times_4, 0));
7677 rorq(y_idx, 32); // convert big-endian to little-endian
7678 bind(L_multiply);
7679 movq(product, x_xstart);
7680 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7681 addq(product, carry);
7682 adcq(rdx, 0);
7683 subl(kdx, 2);
7684 movl(Address(z, kdx, Address::times_4, 4), product);
7685 shrq(product, 32);
7686 movl(Address(z, kdx, Address::times_4, 0), product);
7687 movq(carry, rdx);
7688 jmp(L_first_loop);
7689
7690 bind(L_one_y);
7691 movl(y_idx, Address(y, 0));
7692 jmp(L_multiply);
7693
7694 bind(L_one_x);
7695 movl(x_xstart, Address(x, 0));
7696 jmp(L_first_loop);
7697
7698 bind(L_first_loop_exit);
7699}
7700
7701/**
7702 * Multiply 64 bit by 64 bit and add 128 bit.
7703 */
7704void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7705 Register yz_idx, Register idx,
7706 Register carry, Register product, int offset) {
7707 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7708 // z[kdx] = (jlong)product;
7709
7710 movq(yz_idx, Address(y, idx, Address::times_4, offset));
7711 rorq(yz_idx, 32); // convert big-endian to little-endian
7712 movq(product, x_xstart);
7713 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7714 movq(yz_idx, Address(z, idx, Address::times_4, offset));
7715 rorq(yz_idx, 32); // convert big-endian to little-endian
7716
7717 add2_with_carry(rdx, product, carry, yz_idx);
7718
7719 movl(Address(z, idx, Address::times_4, offset+4), product);
7720 shrq(product, 32);
7721 movl(Address(z, idx, Address::times_4, offset), product);
7722
7723}
7724
7725/**
7726 * Multiply 128 bit by 128 bit. Unrolled inner loop.
7727 */
7728void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7729 Register yz_idx, Register idx, Register jdx,
7730 Register carry, Register product,
7731 Register carry2) {
7732 // jlong carry, x[], y[], z[];
7733 // int kdx = ystart+1;
7734 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7735 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7736 // z[kdx+idx+1] = (jlong)product;
7737 // jlong carry2 = (jlong)(product >>> 64);
7738 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7739 // z[kdx+idx] = (jlong)product;
7740 // carry = (jlong)(product >>> 64);
7741 // }
7742 // idx += 2;
7743 // if (idx > 0) {
7744 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7745 // z[kdx+idx] = (jlong)product;
7746 // carry = (jlong)(product >>> 64);
7747 // }
7748 //
7749
7750 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7751
7752 movl(jdx, idx);
7753 andl(jdx, 0xFFFFFFFC);
7754 shrl(jdx, 2);
7755
7756 bind(L_third_loop);
7757 subl(jdx, 1);
7758 jcc(Assembler::negative, L_third_loop_exit);
7759 subl(idx, 4);
7760
7761 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7762 movq(carry2, rdx);
7763
7764 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7765 movq(carry, rdx);
7766 jmp(L_third_loop);
7767
7768 bind (L_third_loop_exit);
7769
7770 andl (idx, 0x3);
7771 jcc(Assembler::zero, L_post_third_loop_done);
7772
7773 Label L_check_1;
7774 subl(idx, 2);
7775 jcc(Assembler::negative, L_check_1);
7776
7777 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7778 movq(carry, rdx);
7779
7780 bind (L_check_1);
7781 addl (idx, 0x2);
7782 andl (idx, 0x1);
7783 subl(idx, 1);
7784 jcc(Assembler::negative, L_post_third_loop_done);
7785
7786 movl(yz_idx, Address(y, idx, Address::times_4, 0));
7787 movq(product, x_xstart);
7788 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7789 movl(yz_idx, Address(z, idx, Address::times_4, 0));
7790
7791 add2_with_carry(rdx, product, yz_idx, carry);
7792
7793 movl(Address(z, idx, Address::times_4, 0), product);
7794 shrq(product, 32);
7795
7796 shlq(rdx, 32);
7797 orq(product, rdx);
7798 movq(carry, product);
7799
7800 bind(L_post_third_loop_done);
7801}
7802
7803/**
7804 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7805 *
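 * mulxq takes one multiplicand implicitly from rdx and leaves the flags
 * untouched, so on ADX-capable CPUs the loop below keeps two independent
 * carry chains live at once (CF via adcxq, OF via adoxq).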
7806 */
7807void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7808 Register carry, Register carry2,
7809 Register idx, Register jdx,
7810 Register yz_idx1, Register yz_idx2,
7811 Register tmp, Register tmp3, Register tmp4) {
7812 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7813
7814 // jlong carry, x[], y[], z[];
7815 // int kdx = ystart+1;
7816 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7817 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7818 // jlong carry2 = (jlong)(tmp3 >>> 64);
7819 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
7820 // carry = (jlong)(tmp4 >>> 64);
7821 // z[kdx+idx+1] = (jlong)tmp3;
7822 // z[kdx+idx] = (jlong)tmp4;
7823 // }
7824 // idx += 2;
7825 // if (idx > 0) {
7826 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7827 // z[kdx+idx] = (jlong)yz_idx1;
7828 // carry = (jlong)(yz_idx1 >>> 64);
7829 // }
7830 //
7831
7832 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7833
7834 movl(jdx, idx);
7835 andl(jdx, 0xFFFFFFFC);
7836 shrl(jdx, 2);
7837
7838 bind(L_third_loop);
7839 subl(jdx, 1);
7840 jcc(Assembler::negative, L_third_loop_exit);
7841 subl(idx, 4);
7842
7843 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
7844 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7845 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
7846 rorxq(yz_idx2, yz_idx2, 32);
7847
7848 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7849 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
7850
7851 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
7852 rorxq(yz_idx1, yz_idx1, 32);
7853 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7854 rorxq(yz_idx2, yz_idx2, 32);
7855
7856 if (VM_Version::supports_adx()) {
7857 adcxq(tmp3, carry);
7858 adoxq(tmp3, yz_idx1);
7859
7860 adcxq(tmp4, tmp);
7861 adoxq(tmp4, yz_idx2);
7862
7863 movl(carry, 0); // does not affect flags
7864 adcxq(carry2, carry);
7865 adoxq(carry2, carry);
7866 } else {
7867 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7868 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7869 }
7870 movq(carry, carry2);
7871
7872 movl(Address(z, idx, Address::times_4, 12), tmp3);
7873 shrq(tmp3, 32);
7874 movl(Address(z, idx, Address::times_4, 8), tmp3);
7875
7876 movl(Address(z, idx, Address::times_4, 4), tmp4);
7877 shrq(tmp4, 32);
7878 movl(Address(z, idx, Address::times_4, 0), tmp4);
7879
7880 jmp(L_third_loop);
7881
7882 bind (L_third_loop_exit);
7883
7884 andl (idx, 0x3);
7885 jcc(Assembler::zero, L_post_third_loop_done);
7886
7887 Label L_check_1;
7888 subl(idx, 2);
7889 jcc(Assembler::negative, L_check_1);
7890
7891 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
7892 rorxq(yz_idx1, yz_idx1, 32);
7893 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7894 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7895 rorxq(yz_idx2, yz_idx2, 32);
7896
7897 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7898
7899 movl(Address(z, idx, Address::times_4, 4), tmp3);
7900 shrq(tmp3, 32);
7901 movl(Address(z, idx, Address::times_4, 0), tmp3);
7902 movq(carry, tmp4);
7903
7904 bind (L_check_1);
7905 addl (idx, 0x2);
7906 andl (idx, 0x1);
7907 subl(idx, 1);
7908 jcc(Assembler::negative, L_post_third_loop_done);
7909 movl(tmp4, Address(y, idx, Address::times_4, 0));
7910 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
7911 movl(tmp4, Address(z, idx, Address::times_4, 0));
7912
7913 add2_with_carry(carry2, tmp3, tmp4, carry);
7914
7915 movl(Address(z, idx, Address::times_4, 0), tmp3);
7916 shrq(tmp3, 32);
7917
7918 shlq(carry2, 32);
7919 orq(tmp3, carry2);
7920 movq(carry, tmp3);
7921
7922 bind(L_post_third_loop_done);
7923}
7924
7925/**
7926 * Code for BigInteger::multiplyToLen() intrinsic.
7927 *
7928 * rdi: x
7929 * rax: xlen
7930 * rsi: y
7931 * rcx: ylen
7932 * r8: z
7933 * r11: zlen
7934 * r12: tmp1
7935 * r13: tmp2
7936 * r14: tmp3
7937 * r15: tmp4
7938 * rbx: tmp5
7939 *
7940 */
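// x, y and z follow BigInteger's int[] magnitude layout: big-endian 32-bit
// words with the most significant word at index 0; z provides xlen + ylen words.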
7941void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7942 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7943 ShortBranchVerifier sbv(this);
7944 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7945
7946 push(tmp1);
7947 push(tmp2);
7948 push(tmp3);
7949 push(tmp4);
7950 push(tmp5);
7951
7952 push(xlen);
7953 push(zlen);
7954
7955 const Register idx = tmp1;
7956 const Register kdx = tmp2;
7957 const Register xstart = tmp3;
7958
7959 const Register y_idx = tmp4;
7960 const Register carry = tmp5;
7961 const Register product = xlen;
7962 const Register x_xstart = zlen; // reuse register
7963
7964 // First Loop.
7965 //
7966 // final static long LONG_MASK = 0xffffffffL;
7967 // int xstart = xlen - 1;
7968 // int ystart = ylen - 1;
7969 // long carry = 0;
7970  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7971 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7972 // z[kdx] = (int)product;
7973 // carry = product >>> 32;
7974 // }
7975 // z[xstart] = (int)carry;
7976 //
7977
7978 movl(idx, ylen); // idx = ylen;
7979 movl(kdx, zlen); // kdx = xlen+ylen;
7980 xorq(carry, carry); // carry = 0;
7981
7982 Label L_done;
7983
7984 movl(xstart, xlen);
7985 decrementl(xstart);
7986 jcc(Assembler::negative, L_done);
7987
7988 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7989
7990 Label L_second_loop;
7991 testl(kdx, kdx);
7992 jcc(Assembler::zero, L_second_loop);
7993
7994 Label L_carry;
7995 subl(kdx, 1);
7996 jcc(Assembler::zero, L_carry);
7997
7998 movl(Address(z, kdx, Address::times_4, 0), carry);
7999 shrq(carry, 32);
8000 subl(kdx, 1);
8001
8002 bind(L_carry);
8003 movl(Address(z, kdx, Address::times_4, 0), carry);
8004
8005 // Second and third (nested) loops.
8006 //
8007 // for (int i = xstart-1; i >= 0; i--) { // Second loop
8008 // carry = 0;
8009 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8010 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8011 // (z[k] & LONG_MASK) + carry;
8012 // z[k] = (int)product;
8013 // carry = product >>> 32;
8014 // }
8015 // z[i] = (int)carry;
8016 // }
8017 //
8018 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8019
8020 const Register jdx = tmp1;
8021
8022 bind(L_second_loop);
8023 xorl(carry, carry); // carry = 0;
8024 movl(jdx, ylen); // j = ystart+1
8025
8026 subl(xstart, 1); // i = xstart-1;
8027 jcc(Assembler::negative, L_done);
8028
8029 push (z);
8030
8031 Label L_last_x;
8032 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8033 subl(xstart, 1); // i = xstart-1;
8034 jcc(Assembler::negative, L_last_x);
8035
8036 if (UseBMI2Instructions) {
8037 movq(rdx, Address(x, xstart, Address::times_4, 0));
8038 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8039 } else {
8040 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
8041 rorq(x_xstart, 32); // convert big-endian to little-endian
8042 }
8043
8044 Label L_third_loop_prologue;
8045 bind(L_third_loop_prologue);
8046
8047 push (x);
8048 push (xstart);
8049 push (ylen);
8050
8051
8052 if (UseBMI2Instructions) {
8053 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8054 } else { // !UseBMI2Instructions
8055 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8056 }
8057
8058 pop(ylen);
8059 pop(xlen);
8060 pop(x);
8061 pop(z);
8062
8063 movl(tmp3, xlen);
8064 addl(tmp3, 1);
8065 movl(Address(z, tmp3, Address::times_4, 0), carry);
8066 subl(tmp3, 1);
8067 jccb(Assembler::negative, L_done);
8068
8069 shrq(carry, 32);
8070 movl(Address(z, tmp3, Address::times_4, 0), carry);
8071 jmp(L_second_loop);
8072
8073  // Infrequently executed code is moved outside the loops.
8074 bind(L_last_x);
8075 if (UseBMI2Instructions) {
8076 movl(rdx, Address(x, 0));
8077 } else {
8078 movl(x_xstart, Address(x, 0));
8079 }
8080 jmp(L_third_loop_prologue);
8081
8082 bind(L_done);
8083
8084 pop(zlen);
8085 pop(xlen);
8086
8087 pop(tmp5);
8088 pop(tmp4);
8089 pop(tmp3);
8090 pop(tmp2);
8091 pop(tmp1);
8092}
8093
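// Finds the first position at which two memory regions differ; 'length' is an
// element count and 'log2_array_indxscale' the log2 of the element size in bytes.
// A rough sketch of the intent in Java (simplified; the real intrinsic's
// arguments and return encoding differ):
//   static long firstMismatch(byte[] a, byte[] b, long lenBytes) {
//     for (long i = 0; i < lenBytes; i++) {
//       if (a[i] != b[i]) return i;   // byte index of first difference
//     }
//     return -1;                      // regions are equal
//   }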
8094void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8095 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8096 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8097 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8098 Label VECTOR8_TAIL, VECTOR4_TAIL;
8099 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8100 Label SAME_TILL_END, DONE;
8101 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8102
8103  // scale is in rcx on both Win64 and Unix
8104 ShortBranchVerifier sbv(this);
8105
8106 shlq(length);
8107 xorq(result, result);
8108
8109 if ((UseAVX > 2) &&
8110 VM_Version::supports_avx512vlbw()) {
8111 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8112
8113 cmpq(length, 64);
8114 jcc(Assembler::less, VECTOR32_TAIL);
8115 movq(tmp1, length);
8116 andq(tmp1, 0x3F); // tail count
8117 andq(length, ~(0x3F)); //vector count
8118
8119 bind(VECTOR64_LOOP);
8120 // AVX512 code to compare 64 byte vectors.
8121 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8122 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8123 kortestql(k7, k7);
8124 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
8125 addq(result, 64);
8126 subq(length, 64);
8127 jccb(Assembler::notZero, VECTOR64_LOOP);
8128
8130 testq(tmp1, tmp1);
8131 jcc(Assembler::zero, SAME_TILL_END);
8132
8133 //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 remaining bytes.
8135 mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8136 shlxq(tmp2, tmp2, tmp1);
8137 notq(tmp2);
8138 kmovql(k3, tmp2);
8139
8140 evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8141 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8142
8143 ktestql(k7, k3);
8144 jcc(Assembler::below, SAME_TILL_END); // not mismatch
8145
8146 bind(VECTOR64_NOT_EQUAL);
8147 kmovql(tmp1, k7);
8148 notq(tmp1);
8149 tzcntq(tmp1, tmp1);
8150 addq(result, tmp1);
8151 shrq(result);
8152 jmp(DONE);
8153 bind(VECTOR32_TAIL);
8154 }
8155
8156 cmpq(length, 8);
8157 jcc(Assembler::equal, VECTOR8_LOOP);
8158 jcc(Assembler::less, VECTOR4_TAIL);
8159
8160 if (UseAVX >= 2) {
8161 Label VECTOR16_TAIL, VECTOR32_LOOP;
8162
8163 cmpq(length, 16);
8164 jcc(Assembler::equal, VECTOR16_LOOP);
8165 jcc(Assembler::less, VECTOR8_LOOP);
8166
8167 cmpq(length, 32);
8168 jccb(Assembler::less, VECTOR16_TAIL);
8169
8170 subq(length, 32);
8171 bind(VECTOR32_LOOP);
8172 vmovdqu(rymm0, Address(obja, result));
8173 vmovdqu(rymm1, Address(objb, result));
8174 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8175 vptest(rymm2, rymm2);
8176 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8177 addq(result, 32);
8178 subq(length, 32);
8179 jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8180 addq(length, 32);
8181 jcc(Assembler::equal, SAME_TILL_END);
    // fall through if fewer than 32 bytes remain
8183
8184 bind(VECTOR16_TAIL);
8185 cmpq(length, 16);
8186 jccb(Assembler::less, VECTOR8_TAIL);
8187 bind(VECTOR16_LOOP);
8188 movdqu(rymm0, Address(obja, result));
8189 movdqu(rymm1, Address(objb, result));
8190 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8191 ptest(rymm2, rymm2);
8192 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8193 addq(result, 16);
8194 subq(length, 16);
8195 jcc(Assembler::equal, SAME_TILL_END);
8196 //falling through if less than 16 bytes left
8197 } else {//regular intrinsics
8198
8199 cmpq(length, 16);
8200 jccb(Assembler::less, VECTOR8_TAIL);
8201
8202 subq(length, 16);
8203 bind(VECTOR16_LOOP);
8204 movdqu(rymm0, Address(obja, result));
8205 movdqu(rymm1, Address(objb, result));
8206 pxor(rymm0, rymm1);
8207 ptest(rymm0, rymm0);
8208 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8209 addq(result, 16);
8210 subq(length, 16);
8211 jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8212 addq(length, 16);
8213 jcc(Assembler::equal, SAME_TILL_END);
8214 //falling through if less than 16 bytes left
8215 }
8216
8217 bind(VECTOR8_TAIL);
8218 cmpq(length, 8);
8219 jccb(Assembler::less, VECTOR4_TAIL);
8220 bind(VECTOR8_LOOP);
8221 movq(tmp1, Address(obja, result));
8222 movq(tmp2, Address(objb, result));
8223 xorq(tmp1, tmp2);
8224 testq(tmp1, tmp1);
8225 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8226 addq(result, 8);
8227 subq(length, 8);
8228 jcc(Assembler::equal, SAME_TILL_END);
8229 //falling through if less than 8 bytes left
8230
8231 bind(VECTOR4_TAIL);
8232 cmpq(length, 4);
8233 jccb(Assembler::less, BYTES_TAIL);
8234 bind(VECTOR4_LOOP);
8235 movl(tmp1, Address(obja, result));
8236 xorl(tmp1, Address(objb, result));
8237 testl(tmp1, tmp1);
8238 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8239 addq(result, 4);
8240 subq(length, 4);
8241 jcc(Assembler::equal, SAME_TILL_END);
8242 //falling through if less than 4 bytes left
8243
8244 bind(BYTES_TAIL);
8245 bind(BYTES_LOOP);
8246 load_unsigned_byte(tmp1, Address(obja, result));
8247 load_unsigned_byte(tmp2, Address(objb, result));
8248 xorl(tmp1, tmp2);
8249 testl(tmp1, tmp1);
8250 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8251 decq(length);
8252 jcc(Assembler::zero, SAME_TILL_END);
8253 incq(result);
8254 load_unsigned_byte(tmp1, Address(obja, result));
8255 load_unsigned_byte(tmp2, Address(objb, result));
8256 xorl(tmp1, tmp2);
8257 testl(tmp1, tmp1);
8258 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8259 decq(length);
8260 jcc(Assembler::zero, SAME_TILL_END);
8261 incq(result);
8262 load_unsigned_byte(tmp1, Address(obja, result));
8263 load_unsigned_byte(tmp2, Address(objb, result));
8264 xorl(tmp1, tmp2);
8265 testl(tmp1, tmp1);
8266 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8267 jmp(SAME_TILL_END);
8268
8269 if (UseAVX >= 2) {
8270 bind(VECTOR32_NOT_EQUAL);
8271 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8272 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8273 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8274 vpmovmskb(tmp1, rymm0);
8275 bsfq(tmp1, tmp1);
8276 addq(result, tmp1);
8277 shrq(result);
8278 jmp(DONE);
8279 }
8280
8281 bind(VECTOR16_NOT_EQUAL);
8282 if (UseAVX >= 2) {
8283 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8284 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8285 pxor(rymm0, rymm2);
8286 } else {
8287 pcmpeqb(rymm2, rymm2);
8288 pxor(rymm0, rymm1);
8289 pcmpeqb(rymm0, rymm1);
8290 pxor(rymm0, rymm2);
8291 }
8292 pmovmskb(tmp1, rymm0);
8293 bsfq(tmp1, tmp1);
8294 addq(result, tmp1);
8295 shrq(result);
8296 jmpb(DONE);
8297
8298 bind(VECTOR8_NOT_EQUAL);
8299 bind(VECTOR4_NOT_EQUAL);
8300 bsfq(tmp1, tmp1);
8301 shrq(tmp1, 3);
8302 addq(result, tmp1);
8303 bind(BYTES_NOT_EQUAL);
8304 shrq(result);
8305 jmpb(DONE);
8306
8307 bind(SAME_TILL_END);
8308 mov64(result, -1);
8309
8310 bind(DONE);
8311}
8312
8313//Helper functions for square_to_len()
8314
8315/**
8316 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
8317 * Preserves x and z and modifies rest of the registers.
8318 */
8319void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8320 // Perform square and right shift by 1
8321 // Handle odd xlen case first, then for even xlen do the following
8322 // jlong carry = 0;
8323 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8324 // huge_128 product = x[j:j+1] * x[j:j+1];
8325 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8326 // z[i+2:i+3] = (jlong)(product >>> 1);
8327 // carry = (jlong)product;
8328 // }
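  //
  // The squares are stored pre-shifted right by one so that square_to_len()
  // can add the off-diagonal products without doubling them; the final left
  // shift by one (lshift_by_1) and or-ing in the low bit of x[len-1] then
  // recover the true square.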
8329
8330 xorq(tmp5, tmp5); // carry
8331 xorq(rdxReg, rdxReg);
8332 xorl(tmp1, tmp1); // index for x
8333 xorl(tmp4, tmp4); // index for z
8334
8335 Label L_first_loop, L_first_loop_exit;
8336
8337 testl(xlen, 1);
8338 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8339
8340 // Square and right shift by 1 the odd element using 32 bit multiply
8341 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8342 imulq(raxReg, raxReg);
8343 shrq(raxReg, 1);
8344 adcq(tmp5, 0);
8345 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8346 incrementl(tmp1);
8347 addl(tmp4, 2);
8348
8349 // Square and right shift by 1 the rest using 64 bit multiply
8350 bind(L_first_loop);
8351 cmpptr(tmp1, xlen);
8352 jccb(Assembler::equal, L_first_loop_exit);
8353
8354 // Square
8355 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
8356 rorq(raxReg, 32); // convert big-endian to little-endian
8357 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
8358
8359 // Right shift by 1 and save carry
8360 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8361 rcrq(rdxReg, 1);
8362 rcrq(raxReg, 1);
8363 adcq(tmp5, 0);
8364
8365 // Store result in z
8366 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8367 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8368
8369 // Update indices for x and z
8370 addl(tmp1, 2);
8371 addl(tmp4, 4);
8372 jmp(L_first_loop);
8373
8374 bind(L_first_loop_exit);
8375}
8376
8377
8378/**
8379 * Perform the following multiply add operation using BMI2 instructions
8380 * carry:sum = sum + op1*op2 + carry
8381 * op2 should be in rdx
8382 * op2 is preserved, all other registers are modified
8383 */
8384void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8385 // assert op2 is rdx
8386 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
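  // tmp2:op1 now holds the full 128-bit product op1 * rdx (op2); the add/adc
  // pairs below fold the incoming carry and the product's low half into sum,
  // accumulating any carry-outs into tmp2, which becomes the new carry.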
8387 addq(sum, carry);
8388 adcq(tmp2, 0);
8389 addq(sum, op1);
8390 adcq(tmp2, 0);
8391 movq(carry, tmp2);
8392}
8393
8394/**
8395 * Perform the following multiply add operation:
8396 * carry:sum = sum + op1*op2 + carry
8397 * Preserves op1, op2 and modifies rest of registers
8398 */
8399void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8400 // rdx:rax = op1 * op2
8401 movq(raxReg, op2);
8402 mulq(op1);
8403
8404 // rdx:rax = sum + carry + rdx:rax
8405 addq(sum, carry);
8406 adcq(rdxReg, 0);
8407 addq(sum, raxReg);
8408 adcq(rdxReg, 0);
8409
8410 // carry:sum = rdx:sum
8411 movq(carry, rdxReg);
8412}
8413
8414/**
 * Add 64 bit long carry into z[] with carry propagation.
8416 * Preserves z and carry register values and modifies rest of registers.
8417 *
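 * Roughly: add carry to the 64-bit word at z[zlen-2 : zlen-1]; while that
 * addition carries out, step zlen back by two ints and add 1 to the next word.
 *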
8418 */
8419void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8420 Label L_fourth_loop, L_fourth_loop_exit;
8421
8422 movl(tmp1, 1);
8423 subl(zlen, 2);
8424 addq(Address(z, zlen, Address::times_4, 0), carry);
8425
8426 bind(L_fourth_loop);
8427 jccb(Assembler::carryClear, L_fourth_loop_exit);
8428 subl(zlen, 2);
8429 jccb(Assembler::negative, L_fourth_loop_exit);
8430 addq(Address(z, zlen, Address::times_4, 0), tmp1);
8431 jmp(L_fourth_loop);
8432 bind(L_fourth_loop_exit);
8433}
8434
8435/**
8436 * Shift z[] left by 1 bit.
8437 * Preserves x, len, z and zlen registers and modifies rest of the registers.
8438 *
8439 */
8440void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8441
8442 Label L_fifth_loop, L_fifth_loop_exit;
8443
8444 // Fifth loop
8445 // Perform primitiveLeftShift(z, zlen, 1)
8446
8447 const Register prev_carry = tmp1;
8448 const Register new_carry = tmp4;
8449 const Register value = tmp2;
8450 const Register zidx = tmp3;
8451
8452 // int zidx, carry;
8453 // long value;
8454 // carry = 0;
8455 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  // (carry:value) = (z[zidx] << 1) | carry;
  // z[zidx] = value;
8458 // }
8459
8460 movl(zidx, zlen);
8461 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8462
8463 bind(L_fifth_loop);
8464 decl(zidx); // Use decl to preserve carry flag
8465 decl(zidx);
8466 jccb(Assembler::negative, L_fifth_loop_exit);
8467
8468 if (UseBMI2Instructions) {
8469 movq(value, Address(z, zidx, Address::times_4, 0));
8470 rclq(value, 1);
8471 rorxq(value, value, 32);
8472 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
8473 }
8474 else {
8475 // clear new_carry
8476 xorl(new_carry, new_carry);
8477
8478 // Shift z[i] by 1, or in previous carry and save new carry
8479 movq(value, Address(z, zidx, Address::times_4, 0));
8480 shlq(value, 1);
8481 adcl(new_carry, 0);
8482
8483 orq(value, prev_carry);
8484 rorq(value, 0x20);
8485 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
8486
8487 // Set previous carry = new carry
8488 movl(prev_carry, new_carry);
8489 }
8490 jmp(L_fifth_loop);
8491
8492 bind(L_fifth_loop_exit);
8493}
8494
8495
8496/**
8497 * Code for BigInteger::squareToLen() intrinsic
8498 *
8499 * rdi: x
8500 * rsi: len
8501 * r8: z
8502 * rcx: zlen
8503 * r12: tmp1
8504 * r13: tmp2
8505 * r14: tmp3
8506 * r15: tmp4
8507 * rbx: tmp5
8508 *
8509 */
8510void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8511
8512 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8513 push(tmp1);
8514 push(tmp2);
8515 push(tmp3);
8516 push(tmp4);
8517 push(tmp5);
8518
8519 // First loop
8520 // Store the squares, right shifted one bit (i.e., divided by 2).
8521 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8522
8523 // Add in off-diagonal sums.
8524 //
8525 // Second, third (nested) and fourth loops.
8526 // zlen +=2;
8527 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8528 // carry = 0;
8529 // long op2 = x[xidx:xidx+1];
8530 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8531 // k -= 2;
8532 // long op1 = x[j:j+1];
8533 // long sum = z[k:k+1];
8534 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8535 // z[k:k+1] = sum;
8536 // }
8537 // add_one_64(z, k, carry, tmp_regs);
8538 // }
8539
8540 const Register carry = tmp5;
8541 const Register sum = tmp3;
8542 const Register op1 = tmp4;
8543 Register op2 = tmp2;
8544
8545 push(zlen);
8546 push(len);
8547 addl(zlen,2);
8548 bind(L_second_loop);
8549 xorq(carry, carry);
8550 subl(zlen, 4);
8551 subl(len, 2);
8552 push(zlen);
8553 push(len);
8554 cmpl(len, 0);
8555 jccb(Assembler::lessEqual, L_second_loop_exit);
8556
8557 // Multiply an array by one 64 bit long.
8558 if (UseBMI2Instructions) {
8559 op2 = rdxReg;
8560 movq(op2, Address(x, len, Address::times_4, 0));
8561 rorxq(op2, op2, 32);
8562 }
8563 else {
8564 movq(op2, Address(x, len, Address::times_4, 0));
8565 rorq(op2, 32);
8566 }
8567
8568 bind(L_third_loop);
8569 decrementl(len);
8570 jccb(Assembler::negative, L_third_loop_exit);
8571 decrementl(len);
8572 jccb(Assembler::negative, L_last_x);
8573
8574 movq(op1, Address(x, len, Address::times_4, 0));
8575 rorq(op1, 32);
8576
8577 bind(L_multiply);
8578 subl(zlen, 2);
8579 movq(sum, Address(z, zlen, Address::times_4, 0));
8580
8581 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8582 if (UseBMI2Instructions) {
8583 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8584 }
8585 else {
8586 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8587 }
8588
8589 movq(Address(z, zlen, Address::times_4, 0), sum);
8590
8591 jmp(L_third_loop);
8592 bind(L_third_loop_exit);
8593
8594 // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses the adjusted (offset) zlen.
8597 add_one_64(z, zlen, carry, tmp1);
8598
8599 pop(len);
8600 pop(zlen);
8601 jmp(L_second_loop);
8602
  // The following infrequent code is moved outside the loops.
8604 bind(L_last_x);
8605 movl(op1, Address(x, 0));
8606 jmp(L_multiply);
8607
8608 bind(L_second_loop_exit);
8609 pop(len);
8610 pop(zlen);
8611 pop(len);
8612 pop(zlen);
8613
8614 // Fifth loop
8615 // Shift z left 1 bit.
8616 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8617
8618 // z[zlen-1] |= x[len-1] & 1;
8619 movl(tmp3, Address(x, len, Address::times_4, -4));
8620 andl(tmp3, 1);
8621 orl(Address(z, zlen, Address::times_4, -4), tmp3);
8622
8623 pop(tmp5);
8624 pop(tmp4);
8625 pop(tmp3);
8626 pop(tmp2);
8627 pop(tmp1);
8628}
8629
8630/**
8631 * Helper function for mul_add()
8632 * Multiply the in[] by int k and add to out[] starting at offset offs using
8633 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only the quad-int-aligned portion of in[] is processed by this function.
 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate indices in "in" and "out" respectively.
8638 * tmp5 has the carry.
8639 * other registers are temporary and are modified.
8640 *
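 * Roughly, per iteration:
 *   len -= 4; offset -= 4;
 *   carry:out[offset+2 : offset+3] += in[len+2 : len+3] * k;   // less significant pair first
 *   carry:out[offset+0 : offset+1] += in[len+0 : len+1] * k;
 * (a sketch only; indices are int indices into the big-endian arrays and each
 *  line folds the previous carry in and produces a new one)
 *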
8641 */
8642void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8643 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8644 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8645
8646 Label L_first_loop, L_first_loop_exit;
8647
8648 movl(tmp1, len);
8649 shrl(tmp1, 2);
8650
8651 bind(L_first_loop);
8652 subl(tmp1, 1);
8653 jccb(Assembler::negative, L_first_loop_exit);
8654
8655 subl(len, 4);
8656 subl(offset, 4);
8657
8658 Register op2 = tmp2;
8659 const Register sum = tmp3;
8660 const Register op1 = tmp4;
8661 const Register carry = tmp5;
8662
8663 if (UseBMI2Instructions) {
8664 op2 = rdxReg;
8665 }
8666
8667 movq(op1, Address(in, len, Address::times_4, 8));
8668 rorq(op1, 32);
8669 movq(sum, Address(out, offset, Address::times_4, 8));
8670 rorq(sum, 32);
8671 if (UseBMI2Instructions) {
8672 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8673 }
8674 else {
8675 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8676 }
8677 // Store back in big endian from little endian
8678 rorq(sum, 0x20);
8679 movq(Address(out, offset, Address::times_4, 8), sum);
8680
8681 movq(op1, Address(in, len, Address::times_4, 0));
8682 rorq(op1, 32);
8683 movq(sum, Address(out, offset, Address::times_4, 0));
8684 rorq(sum, 32);
8685 if (UseBMI2Instructions) {
8686 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8687 }
8688 else {
8689 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8690 }
8691 // Store back in big endian from little endian
8692 rorq(sum, 0x20);
8693 movq(Address(out, offset, Address::times_4, 0), sum);
8694
8695 jmp(L_first_loop);
8696 bind(L_first_loop_exit);
8697}
8698
8699/**
8700 * Code for BigInteger::mulAdd() intrinsic
8701 *
8702 * rdi: out
8703 * rsi: in
8704 * r11: offs (out.length - offset)
8705 * rcx: len
8706 * r8: k
8707 * r12: tmp1
8708 * r13: tmp2
8709 * r14: tmp3
8710 * r15: tmp4
8711 * rbx: tmp5
8712 * Multiply the in[] by word k and add to out[], return the carry in rax
8713 */
8714void MacroAssembler::mul_add(Register out, Register in, Register offs,
8715 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8716 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8717
8718 Label L_carry, L_last_in, L_done;
8719
8720// carry = 0;
8721// for (int j=len-1; j >= 0; j--) {
8722// long product = (in[j] & LONG_MASK) * kLong +
8723// (out[offs] & LONG_MASK) + carry;
8724// out[offs--] = (int)product;
8725// carry = product >>> 32;
8726// }
8727//
8728 push(tmp1);
8729 push(tmp2);
8730 push(tmp3);
8731 push(tmp4);
8732 push(tmp5);
8733
8734 Register op2 = tmp2;
8735 const Register sum = tmp3;
8736 const Register op1 = tmp4;
8737 const Register carry = tmp5;
8738
8739 if (UseBMI2Instructions) {
8740 op2 = rdxReg;
8741 movl(op2, k);
8742 }
8743 else {
8744 movl(op2, k);
8745 }
8746
8747 xorq(carry, carry);
8748
8749 //First loop
8750
8751 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8752 //The carry is in tmp5
8753 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8754
8755 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8756 decrementl(len);
8757 jccb(Assembler::negative, L_carry);
8758 decrementl(len);
8759 jccb(Assembler::negative, L_last_in);
8760
8761 movq(op1, Address(in, len, Address::times_4, 0));
8762 rorq(op1, 32);
8763
8764 subl(offs, 2);
8765 movq(sum, Address(out, offs, Address::times_4, 0));
8766 rorq(sum, 32);
8767
8768 if (UseBMI2Instructions) {
8769 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8770 }
8771 else {
8772 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8773 }
8774
8775 // Store back in big endian from little endian
8776 rorq(sum, 0x20);
8777 movq(Address(out, offs, Address::times_4, 0), sum);
8778
8779 testl(len, len);
8780 jccb(Assembler::zero, L_carry);
8781
8782 //Multiply the last in[] entry, if any
8783 bind(L_last_in);
8784 movl(op1, Address(in, 0));
8785 movl(sum, Address(out, offs, Address::times_4, -4));
8786
8787 movl(raxReg, k);
8788 mull(op1); //tmp4 * eax -> edx:eax
8789 addl(sum, carry);
8790 adcl(rdxReg, 0);
8791 addl(sum, raxReg);
8792 adcl(rdxReg, 0);
8793 movl(carry, rdxReg);
8794
8795 movl(Address(out, offs, Address::times_4, -4), sum);
8796
8797 bind(L_carry);
8798 //return tmp5/carry as carry in rax
8799 movl(rax, carry);
8800
8801 bind(L_done);
8802 pop(tmp5);
8803 pop(tmp4);
8804 pop(tmp3);
8805 pop(tmp2);
8806 pop(tmp1);
8807}
8808#endif
8809
8810/**
8811 * Emits code to update CRC-32 with a byte value according to constants in table
8812 *
 * @param [in,out] crc   Register containing the crc.
 * @param [in]     val   Register containing the byte to fold into the CRC.
 * @param [in]     table Register containing the table of crc constants.
8816 *
8817 * uint32_t crc;
8818 * val = crc_table[(val ^ crc) & 0xFF];
8819 * crc = val ^ (crc >> 8);
8820 *
8821 */
8822void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8823 xorl(val, crc);
8824 andl(val, 0xFF);
8825 shrl(crc, 8); // unsigned shift
8826 xorl(crc, Address(table, val, Address::times_4, 0));
8827}
8828
8829/**
8830* Fold four 128-bit data chunks
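* Roughly, per 128-bit lane:
*   crc' = clmul(crc_lo, K_lo) ^ clmul(crc_hi, K_hi) ^ data[offset]
* where xK holds the folding constants for the distance being folded over.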
8831*/
8832void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8833 evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
8834 evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
8835 evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
8836 evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
8837}
8838
8839/**
8840 * Fold 128-bit data chunk
8841 */
8842void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8843 if (UseAVX > 0) {
8844 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8845 vpclmulldq(xcrc, xK, xcrc); // [63:0]
8846 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8847 pxor(xcrc, xtmp);
8848 } else {
8849 movdqa(xtmp, xcrc);
8850 pclmulhdq(xtmp, xK); // [123:64]
8851 pclmulldq(xcrc, xK); // [63:0]
8852 pxor(xcrc, xtmp);
8853 movdqu(xtmp, Address(buf, offset));
8854 pxor(xcrc, xtmp);
8855 }
8856}
8857
8858void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8859 if (UseAVX > 0) {
8860 vpclmulhdq(xtmp, xK, xcrc);
8861 vpclmulldq(xcrc, xK, xcrc);
8862 pxor(xcrc, xbuf);
8863 pxor(xcrc, xtmp);
8864 } else {
8865 movdqa(xtmp, xcrc);
8866 pclmulhdq(xtmp, xK);
8867 pclmulldq(xcrc, xK);
8868 pxor(xcrc, xbuf);
8869 pxor(xcrc, xtmp);
8870 }
8871}
8872
8873/**
8874 * 8-bit folds to compute 32-bit CRC
8875 *
8876 * uint64_t xcrc;
8877 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8878 */
8879void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8880 movdl(tmp, xcrc);
8881 andl(tmp, 0xFF);
8882 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8883 psrldq(xcrc, 1); // unsigned shift one byte
8884 pxor(xcrc, xtmp);
8885}
8886
8887/**
8888 * uint32_t crc;
8889 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8890 */
8891void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8892 movl(tmp, crc);
8893 andl(tmp, 0xFF);
8894 shrl(crc, 8);
8895 xorl(crc, Address(table, tmp, Address::times_4, 0));
8896}
8897
8898/**
8899 * @param crc register containing existing CRC (32-bit)
8900 * @param buf register pointing to input byte buffer (byte*)
8901 * @param len register containing number of bytes
8902 * @param table register that will contain address of CRC table
8903 * @param tmp scratch register
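 *
 * Rough outline of the code below (CPU-specific fast paths elided): process
 * bytes one at a time until buf is 16-byte aligned, fold 512 bits per
 * iteration into four 128-bit accumulators, reduce those to a single 128-bit
 * value, fold that down to 32 bits via 8-bit table lookups, and finish any
 * remaining tail bytes one at a time.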
8904 */
8905void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8906 assert_different_registers(crc, buf, len, table, tmp, rax);
8907
8908 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8909 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8910
8911 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8912 // context for the registers used, where all instructions below are using 128-bit mode
8913 // On EVEX without VL and BW, these instructions will all be AVX.
8914 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8915 notl(crc); // ~crc
8916 cmpl(len, 16);
8917 jcc(Assembler::less, L_tail);
8918
8919 // Align buffer to 16 bytes
8920 movl(tmp, buf);
8921 andl(tmp, 0xF);
8922 jccb(Assembler::zero, L_aligned);
8923 subl(tmp, 16);
8924 addl(len, tmp);
8925
8926 align(4);
8927 BIND(L_align_loop);
8928 movsbl(rax, Address(buf, 0)); // load byte with sign extension
8929 update_byte_crc32(crc, rax, table);
8930 increment(buf);
8931 incrementl(tmp);
8932 jccb(Assembler::less, L_align_loop);
8933
8934 BIND(L_aligned);
8935 movl(tmp, len); // save
8936 shrl(len, 4);
8937 jcc(Assembler::zero, L_tail_restore);
8938
8939 // Fold total 512 bits of polynomial on each iteration
8940 if (VM_Version::supports_vpclmulqdq()) {
8941 Label Parallel_loop, L_No_Parallel;
8942
8943 cmpl(len, 8);
8944 jccb(Assembler::less, L_No_Parallel);
8945
8946 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8947 evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
8948 movdl(xmm5, crc);
8949 evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
8950 addptr(buf, 64);
8951 subl(len, 7);
8952 evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
8953
8954 BIND(Parallel_loop);
8955 fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
8956 addptr(buf, 64);
8957 subl(len, 4);
8958 jcc(Assembler::greater, Parallel_loop);
8959
8960 vextracti64x2(xmm2, xmm1, 0x01);
8961 vextracti64x2(xmm3, xmm1, 0x02);
8962 vextracti64x2(xmm4, xmm1, 0x03);
8963 jmp(L_fold_512b);
8964
8965 BIND(L_No_Parallel);
8966 }
8967 // Fold crc into first bytes of vector
8968 movdqa(xmm1, Address(buf, 0));
8969 movdl(rax, xmm1);
8970 xorl(crc, rax);
8971 if (VM_Version::supports_sse4_1()) {
8972 pinsrd(xmm1, crc, 0);
8973 } else {
8974 pinsrw(xmm1, crc, 0);
8975 shrl(crc, 16);
8976 pinsrw(xmm1, crc, 1);
8977 }
8978 addptr(buf, 16);
8979 subl(len, 4); // len > 0
8980 jcc(Assembler::less, L_fold_tail);
8981
8982 movdqa(xmm2, Address(buf, 0));
8983 movdqa(xmm3, Address(buf, 16));
8984 movdqa(xmm4, Address(buf, 32));
8985 addptr(buf, 48);
8986 subl(len, 3);
8987 jcc(Assembler::lessEqual, L_fold_512b);
8988
8989 // Fold total 512 bits of polynomial on each iteration,
8990 // 128 bits per each of 4 parallel streams.
8991 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8992
8993 align(32);
8994 BIND(L_fold_512b_loop);
8995 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8996 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8997 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8998 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8999 addptr(buf, 64);
9000 subl(len, 4);
9001 jcc(Assembler::greater, L_fold_512b_loop);
9002
9003 // Fold 512 bits to 128 bits.
9004 BIND(L_fold_512b);
9005 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9006 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9007 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9008 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9009
9010 // Fold the rest of 128 bits data chunks
9011 BIND(L_fold_tail);
9012 addl(len, 3);
9013 jccb(Assembler::lessEqual, L_fold_128b);
9014 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9015
9016 BIND(L_fold_tail_loop);
9017 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
9018 addptr(buf, 16);
9019 decrementl(len);
9020 jccb(Assembler::greater, L_fold_tail_loop);
9021
9022 // Fold 128 bits in xmm1 down into 32 bits in crc register.
9023 BIND(L_fold_128b);
9024 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9025 if (UseAVX > 0) {
9026 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9027 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9028 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9029 } else {
9030 movdqa(xmm2, xmm0);
9031 pclmulqdq(xmm2, xmm1, 0x1);
9032 movdqa(xmm3, xmm0);
9033 pand(xmm3, xmm2);
9034 pclmulqdq(xmm0, xmm3, 0x1);
9035 }
9036 psrldq(xmm1, 8);
9037 psrldq(xmm2, 4);
9038 pxor(xmm0, xmm1);
9039 pxor(xmm0, xmm2);
9040
9041 // 8 8-bit folds to compute 32-bit CRC.
9042 for (int j = 0; j < 4; j++) {
9043 fold_8bit_crc32(xmm0, table, xmm1, rax);
9044 }
9045 movdl(crc, xmm0); // mov 32 bits to general register
9046 for (int j = 0; j < 4; j++) {
9047 fold_8bit_crc32(crc, table, rax);
9048 }
9049
9050 BIND(L_tail_restore);
9051 movl(len, tmp); // restore
9052 BIND(L_tail);
9053 andl(len, 0xf);
9054 jccb(Assembler::zero, L_exit);
9055
9056 // Fold the rest of bytes
9057 align(4);
9058 BIND(L_tail_loop);
9059 movsbl(rax, Address(buf, 0)); // load byte with sign extension
9060 update_byte_crc32(crc, rax, table);
9061 increment(buf);
9062 decrementl(len);
9063 jccb(Assembler::greater, L_tail_loop);
9064
9065 BIND(L_exit);
  notl(crc); // ~crc
9067}
9068
9069#ifdef _LP64
9070// S. Gueron / Information Processing Letters 112 (2012) 184
9071// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
9072// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
9073// Output: the 64-bit carry-less product of B * CONST
9074void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9075 Register tmp1, Register tmp2, Register tmp3) {
9076 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9077 if (n > 0) {
9078 addq(tmp3, n * 256 * 8);
9079 }
9080 // Q1 = TABLEExt[n][B & 0xFF];
9081 movl(tmp1, in);
9082 andl(tmp1, 0x000000FF);
9083 shll(tmp1, 3);
9084 addq(tmp1, tmp3);
9085 movq(tmp1, Address(tmp1, 0));
9086
9087 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
9088 movl(tmp2, in);
9089 shrl(tmp2, 8);
9090 andl(tmp2, 0x000000FF);
9091 shll(tmp2, 3);
9092 addq(tmp2, tmp3);
9093 movq(tmp2, Address(tmp2, 0));
9094
9095 shlq(tmp2, 8);
9096 xorq(tmp1, tmp2);
9097
9098 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
9099 movl(tmp2, in);
9100 shrl(tmp2, 16);
9101 andl(tmp2, 0x000000FF);
9102 shll(tmp2, 3);
9103 addq(tmp2, tmp3);
9104 movq(tmp2, Address(tmp2, 0));
9105
9106 shlq(tmp2, 16);
9107 xorq(tmp1, tmp2);
9108
9109 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
9110 shrl(in, 24);
9111 andl(in, 0x000000FF);
9112 shll(in, 3);
9113 addq(in, tmp3);
9114 movq(in, Address(in, 0));
9115
9116 shlq(in, 24);
9117 xorq(in, tmp1);
9118 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9119}
9120
9121void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9122 Register in_out,
9123 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9124 XMMRegister w_xtmp2,
9125 Register tmp1,
9126 Register n_tmp2, Register n_tmp3) {
9127 if (is_pclmulqdq_supported) {
9128 movdl(w_xtmp1, in_out); // modified blindly
9129
9130 movl(tmp1, const_or_pre_comp_const_index);
9131 movdl(w_xtmp2, tmp1);
9132 pclmulqdq(w_xtmp1, w_xtmp2, 0);
9133
9134 movdq(in_out, w_xtmp1);
9135 } else {
9136 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9137 }
9138}
9139
9140// Recombination Alternative 2: No bit-reflections
9141// T1 = (CRC_A * U1) << 1
9142// T2 = (CRC_B * U2) << 1
9143// C1 = T1 >> 32
9144// C2 = T2 >> 32
9145// T1 = T1 & 0xFFFFFFFF
9146// T2 = T2 & 0xFFFFFFFF
9147// T1 = CRC32(0, T1)
9148// T2 = CRC32(0, T2)
9149// C1 = C1 ^ T1
9150// C2 = C2 ^ T2
9151// CRC = C1 ^ C2 ^ CRC_C
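// (Roughly: multiplying each block's CRC by a precomputed constant advances it
// over the data that follows it, so the three per-block CRCs can then simply be
// XOR-ed together; U1 and U2 are the constants for those block lengths.)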
9152void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9153 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9154 Register tmp1, Register tmp2,
9155 Register n_tmp3) {
9156 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9157 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9158 shlq(in_out, 1);
9159 movl(tmp1, in_out);
9160 shrq(in_out, 32);
9161 xorl(tmp2, tmp2);
9162 crc32(tmp2, tmp1, 4);
9163 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9164 shlq(in1, 1);
9165 movl(tmp1, in1);
9166 shrq(in1, 32);
9167 xorl(tmp2, tmp2);
9168 crc32(tmp2, tmp1, 4);
9169 xorl(in1, tmp2);
9170 xorl(in_out, in1);
9171 xorl(in_out, in2);
9172}
9173
// Set N to a predefined value
// Subtract from the length of the buffer
// and execute in a loop:
9177// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9178// for i = 1 to N do
9179// CRC_A = CRC32(CRC_A, A[i])
9180// CRC_B = CRC32(CRC_B, B[i])
9181// CRC_C = CRC32(CRC_C, C[i])
9182// end for
9183// Recombine
9184void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9185 Register in_out1, Register in_out2, Register in_out3,
9186 Register tmp1, Register tmp2, Register tmp3,
9187 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9188 Register tmp4, Register tmp5,
9189 Register n_tmp6) {
9190 Label L_processPartitions;
9191 Label L_processPartition;
9192 Label L_exit;
9193
9194 bind(L_processPartitions);
9195 cmpl(in_out1, 3 * size);
9196 jcc(Assembler::less, L_exit);
9197 xorl(tmp1, tmp1);
9198 xorl(tmp2, tmp2);
9199 movq(tmp3, in_out2);
9200 addq(tmp3, size);
9201
9202 bind(L_processPartition);
9203 crc32(in_out3, Address(in_out2, 0), 8);
9204 crc32(tmp1, Address(in_out2, size), 8);
9205 crc32(tmp2, Address(in_out2, size * 2), 8);
9206 addq(in_out2, 8);
9207 cmpq(in_out2, tmp3);
9208 jcc(Assembler::less, L_processPartition);
9209 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9210 w_xtmp1, w_xtmp2, w_xtmp3,
9211 tmp4, tmp5,
9212 n_tmp6);
9213 addq(in_out2, 2 * size);
9214 subl(in_out1, 3 * size);
9215 jmp(L_processPartitions);
9216
9217 bind(L_exit);
9218}
9219#else
9220void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9221 Register tmp1, Register tmp2, Register tmp3,
9222 XMMRegister xtmp1, XMMRegister xtmp2) {
9223 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9224 if (n > 0) {
9225 addl(tmp3, n * 256 * 8);
9226 }
9227 // Q1 = TABLEExt[n][B & 0xFF];
9228 movl(tmp1, in_out);
9229 andl(tmp1, 0x000000FF);
9230 shll(tmp1, 3);
9231 addl(tmp1, tmp3);
9232 movq(xtmp1, Address(tmp1, 0));
9233
9234 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
9235 movl(tmp2, in_out);
9236 shrl(tmp2, 8);
9237 andl(tmp2, 0x000000FF);
9238 shll(tmp2, 3);
9239 addl(tmp2, tmp3);
9240 movq(xtmp2, Address(tmp2, 0));
9241
9242 psllq(xtmp2, 8);
9243 pxor(xtmp1, xtmp2);
9244
9245 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
9246 movl(tmp2, in_out);
9247 shrl(tmp2, 16);
9248 andl(tmp2, 0x000000FF);
9249 shll(tmp2, 3);
9250 addl(tmp2, tmp3);
9251 movq(xtmp2, Address(tmp2, 0));
9252
9253 psllq(xtmp2, 16);
9254 pxor(xtmp1, xtmp2);
9255
9256 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
9257 shrl(in_out, 24);
9258 andl(in_out, 0x000000FF);
9259 shll(in_out, 3);
9260 addl(in_out, tmp3);
9261 movq(xtmp2, Address(in_out, 0));
9262
9263 psllq(xtmp2, 24);
9264 pxor(xtmp1, xtmp2); // Result in CXMM
9265 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9266}
9267
9268void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9269 Register in_out,
9270 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9271 XMMRegister w_xtmp2,
9272 Register tmp1,
9273 Register n_tmp2, Register n_tmp3) {
9274 if (is_pclmulqdq_supported) {
9275 movdl(w_xtmp1, in_out);
9276
9277 movl(tmp1, const_or_pre_comp_const_index);
9278 movdl(w_xtmp2, tmp1);
9279 pclmulqdq(w_xtmp1, w_xtmp2, 0);
9280 // Keep result in XMM since GPR is 32 bit in length
9281 } else {
9282 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9283 }
9284}
9285
9286void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9287 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9288 Register tmp1, Register tmp2,
9289 Register n_tmp3) {
9290 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9291 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9292
9293 psllq(w_xtmp1, 1);
9294 movdl(tmp1, w_xtmp1);
9295 psrlq(w_xtmp1, 32);
9296 movdl(in_out, w_xtmp1);
9297
9298 xorl(tmp2, tmp2);
9299 crc32(tmp2, tmp1, 4);
9300 xorl(in_out, tmp2);
9301
9302 psllq(w_xtmp2, 1);
9303 movdl(tmp1, w_xtmp2);
9304 psrlq(w_xtmp2, 32);
9305 movdl(in1, w_xtmp2);
9306
9307 xorl(tmp2, tmp2);
9308 crc32(tmp2, tmp1, 4);
9309 xorl(in1, tmp2);
9310 xorl(in_out, in1);
9311 xorl(in_out, in2);
9312}
9313
9314void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9315 Register in_out1, Register in_out2, Register in_out3,
9316 Register tmp1, Register tmp2, Register tmp3,
9317 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9318 Register tmp4, Register tmp5,
9319 Register n_tmp6) {
9320 Label L_processPartitions;
9321 Label L_processPartition;
9322 Label L_exit;
9323
9324 bind(L_processPartitions);
9325 cmpl(in_out1, 3 * size);
9326 jcc(Assembler::less, L_exit);
9327 xorl(tmp1, tmp1);
9328 xorl(tmp2, tmp2);
9329 movl(tmp3, in_out2);
9330 addl(tmp3, size);
9331
9332 bind(L_processPartition);
9333 crc32(in_out3, Address(in_out2, 0), 4);
9334 crc32(tmp1, Address(in_out2, size), 4);
9335 crc32(tmp2, Address(in_out2, size*2), 4);
9336 crc32(in_out3, Address(in_out2, 0+4), 4);
9337 crc32(tmp1, Address(in_out2, size+4), 4);
9338 crc32(tmp2, Address(in_out2, size*2+4), 4);
9339 addl(in_out2, 8);
9340 cmpl(in_out2, tmp3);
9341 jcc(Assembler::less, L_processPartition);
9342
9343 push(tmp3);
9344 push(in_out1);
9345 push(in_out2);
9346 tmp4 = tmp3;
9347 tmp5 = in_out1;
9348 n_tmp6 = in_out2;
9349
9350 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9351 w_xtmp1, w_xtmp2, w_xtmp3,
9352 tmp4, tmp5,
9353 n_tmp6);
9354
9355 pop(in_out2);
9356 pop(in_out1);
9357 pop(tmp3);
9358
9359 addl(in_out2, 2 * size);
9360 subl(in_out1, 3 * size);
9361 jmp(L_processPartitions);
9362
9363 bind(L_exit);
9364}
9365#endif //LP64
9366
9367#ifdef _LP64
9368// Algorithm 2: Pipelined usage of the CRC32 instruction.
9369// Input: A buffer I of L bytes.
9370// Output: the CRC32C value of the buffer.
9371// Notations:
9372// Write L = 24N + r, with N = floor (L/24).
9373// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
9376// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
9377// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
9378// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
9379// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
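// Example of the notation: for L = 100, N = floor(100/24) = 4 and r = 4, so A,
// B and C are 4 quadwords (32 bytes) each and R is the trailing 4 bytes handled
// by the word-by-word / byte-by-byte code at the end.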
9380void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9381 Register tmp1, Register tmp2, Register tmp3,
9382 Register tmp4, Register tmp5, Register tmp6,
9383 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9384 bool is_pclmulqdq_supported) {
9385 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9386 Label L_wordByWord;
9387 Label L_byteByByteProlog;
9388 Label L_byteByByte;
9389 Label L_exit;
9390
9391 if (is_pclmulqdq_supported ) {
9392 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9393 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9394
9395 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9396 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9397
9398 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9399 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9400 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9401 } else {
9402 const_or_pre_comp_const_index[0] = 1;
9403 const_or_pre_comp_const_index[1] = 0;
9404
9405 const_or_pre_comp_const_index[2] = 3;
9406 const_or_pre_comp_const_index[3] = 2;
9407
9408 const_or_pre_comp_const_index[4] = 5;
9409 const_or_pre_comp_const_index[5] = 4;
9410 }
9411 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9412 in2, in1, in_out,
9413 tmp1, tmp2, tmp3,
9414 w_xtmp1, w_xtmp2, w_xtmp3,
9415 tmp4, tmp5,
9416 tmp6);
9417 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9418 in2, in1, in_out,
9419 tmp1, tmp2, tmp3,
9420 w_xtmp1, w_xtmp2, w_xtmp3,
9421 tmp4, tmp5,
9422 tmp6);
9423 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9424 in2, in1, in_out,
9425 tmp1, tmp2, tmp3,
9426 w_xtmp1, w_xtmp2, w_xtmp3,
9427 tmp4, tmp5,
9428 tmp6);
9429 movl(tmp1, in2);
9430 andl(tmp1, 0x00000007);
9431 negl(tmp1);
9432 addl(tmp1, in2);
9433 addq(tmp1, in1);
9434
9435 BIND(L_wordByWord);
9436 cmpq(in1, tmp1);
9437 jcc(Assembler::greaterEqual, L_byteByByteProlog);
9438 crc32(in_out, Address(in1, 0), 4);
9439 addq(in1, 4);
9440 jmp(L_wordByWord);
9441
9442 BIND(L_byteByByteProlog);
9443 andl(in2, 0x00000007);
9444 movl(tmp2, 1);
9445
9446 BIND(L_byteByByte);
9447 cmpl(tmp2, in2);
9448 jccb(Assembler::greater, L_exit);
9449 crc32(in_out, Address(in1, 0), 1);
9450 incq(in1);
9451 incl(tmp2);
9452 jmp(L_byteByByte);
9453
9454 BIND(L_exit);
9455}
9456#else
9457void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9458 Register tmp1, Register tmp2, Register tmp3,
9459 Register tmp4, Register tmp5, Register tmp6,
9460 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9461 bool is_pclmulqdq_supported) {
9462 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9463 Label L_wordByWord;
9464 Label L_byteByByteProlog;
9465 Label L_byteByByte;
9466 Label L_exit;
9467
9468 if (is_pclmulqdq_supported) {
9469 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9470 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9471
9472 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9473 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9474
9475 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9476 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9477 } else {
9478 const_or_pre_comp_const_index[0] = 1;
9479 const_or_pre_comp_const_index[1] = 0;
9480
9481 const_or_pre_comp_const_index[2] = 3;
9482 const_or_pre_comp_const_index[3] = 2;
9483
9484 const_or_pre_comp_const_index[4] = 5;
9485 const_or_pre_comp_const_index[5] = 4;
9486 }
9487 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9488 in2, in1, in_out,
9489 tmp1, tmp2, tmp3,
9490 w_xtmp1, w_xtmp2, w_xtmp3,
9491 tmp4, tmp5,
9492 tmp6);
9493 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9494 in2, in1, in_out,
9495 tmp1, tmp2, tmp3,
9496 w_xtmp1, w_xtmp2, w_xtmp3,
9497 tmp4, tmp5,
9498 tmp6);
9499 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9500 in2, in1, in_out,
9501 tmp1, tmp2, tmp3,
9502 w_xtmp1, w_xtmp2, w_xtmp3,
9503 tmp4, tmp5,
9504 tmp6);
9505 movl(tmp1, in2);
9506 andl(tmp1, 0x00000007);
9507 negl(tmp1);
9508 addl(tmp1, in2);
9509 addl(tmp1, in1);
9510
9511 BIND(L_wordByWord);
9512 cmpl(in1, tmp1);
9513 jcc(Assembler::greaterEqual, L_byteByByteProlog);
9514 crc32(in_out, Address(in1,0), 4);
9515 addl(in1, 4);
9516 jmp(L_wordByWord);
9517
9518 BIND(L_byteByByteProlog);
9519 andl(in2, 0x00000007);
9520 movl(tmp2, 1);
9521
9522 BIND(L_byteByByte);
9523 cmpl(tmp2, in2);
9524 jccb(Assembler::greater, L_exit);
9525 movb(tmp1, Address(in1, 0));
9526 crc32(in_out, tmp1, 1);
9527 incl(in1);
9528 incl(tmp2);
9529 jmp(L_byteByByte);
9530
9531 BIND(L_exit);
9532}
9533#endif // LP64
9534#undef BIND
9535#undef BLOCK_COMMENT
9536
9537// Compress char[] array to byte[].
9538// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
9539// @HotSpotIntrinsicCandidate
9540// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
9541// for (int i = 0; i < len; i++) {
9542// int c = src[srcOff++];
9543// if (c >>> 8 != 0) {
9544// return 0;
9545// }
9546// dst[dstOff++] = (byte)c;
9547// }
9548// return len;
9549// }
9550void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
9551 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
9552 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
9553 Register tmp5, Register result) {
9554 Label copy_chars_loop, return_length, return_zero, done;
9555
9556 // rsi: src
9557 // rdi: dst
9558 // rdx: len
9559 // rcx: tmp5
9560 // rax: result
9561
9562 // rsi holds start addr of source char[] to be compressed
9563 // rdi holds start addr of destination byte[]
9564 // rdx holds length
9565
9566 assert(len != result, "");
9567
9568 // save length for return
9569 push(len);
9570
9571 if ((UseAVX > 2) && // AVX512
9572 VM_Version::supports_avx512vlbw() &&
9573 VM_Version::supports_bmi2()) {
9574
9575 Label copy_32_loop, copy_loop_tail, below_threshold;
9576
9577 // alignment
9578 Label post_alignment;
9579
    // if length of the string is less than 32, handle it the old-fashioned way
9581 testl(len, -32);
9582 jcc(Assembler::zero, below_threshold);
9583
    // First check whether a character is compressible (<= 0xFF).
9585 // Create mask to test for Unicode chars inside zmm vector
9586 movl(result, 0x00FF);
9587 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
9588
9589 testl(len, -64);
9590 jcc(Assembler::zero, post_alignment);
9591
9592 movl(tmp5, dst);
9593 andl(tmp5, (32 - 1));
9594 negl(tmp5);
9595 andl(tmp5, (32 - 1));
9596
9597 // bail out when there is nothing to be done
9598 testl(tmp5, 0xFFFFFFFF);
9599 jcc(Assembler::zero, post_alignment);
9600
    // ~(~0 << tmp5), where tmp5 is the # of elements to process for alignment
9602 movl(result, 0xFFFFFFFF);
9603 shlxl(result, result, tmp5);
9604 notl(result);
9605 kmovdl(k3, result);
9606
9607 evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9608 evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9609 ktestd(k2, k3);
9610 jcc(Assembler::carryClear, return_zero);
9611
9612 evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9613
9614 addptr(src, tmp5);
9615 addptr(src, tmp5);
9616 addptr(dst, tmp5);
9617 subl(len, tmp5);
9618
9619 bind(post_alignment);
9620 // end of alignment
9621
9622 movl(tmp5, len);
9623 andl(tmp5, (32 - 1)); // tail count (in chars)
9624 andl(len, ~(32 - 1)); // vector count (in chars)
9625 jcc(Assembler::zero, copy_loop_tail);
9626
9627 lea(src, Address(src, len, Address::times_2));
9628 lea(dst, Address(dst, len, Address::times_1));
9629 negptr(len);
9630
9631 bind(copy_32_loop);
9632 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9633 evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9634 kortestdl(k2, k2);
9635 jcc(Assembler::carryClear, return_zero);
9636
    // All elements in the current chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
9639 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9640 addptr(len, 32);
9641 jcc(Assembler::notZero, copy_32_loop);
9642
9643 bind(copy_loop_tail);
9644 // bail out when there is nothing to be done
9645 testl(tmp5, 0xFFFFFFFF);
9646 jcc(Assembler::zero, return_length);
9647
9648 movl(len, tmp5);
9649
9650 // ~(~0 << len), where len is the # of remaining elements to process
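    // e.g. len = 5 gives ~(~0 << 5) = 0x1F, enabling only the low 5 word lanes
    // for the masked load, compare and store below.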
9651 movl(result, 0xFFFFFFFF);
9652 shlxl(result, result, len);
9653 notl(result);
9654
9655 kmovdl(k3, result);
9656
9657 evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9658 evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9659 ktestd(k2, k3);
9660 jcc(Assembler::carryClear, return_zero);
9661
9662 evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9663 jmp(return_length);
9664
9665 bind(below_threshold);
9666 }
9667
9668 if (UseSSE42Intrinsics) {
9669 Label copy_32_loop, copy_16, copy_tail;
9670
9671 movl(result, len);
9672
9673 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
9674
9675 // vectored compression
9676 andl(len, 0xfffffff0); // vector count (in chars)
9677 andl(result, 0x0000000f); // tail count (in chars)
9678 testl(len, len);
9679 jcc(Assembler::zero, copy_16);
9680
9681 // compress 16 chars per iter
9682 movdl(tmp1Reg, tmp5);
9683 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
9684 pxor(tmp4Reg, tmp4Reg);
9685
9686 lea(src, Address(src, len, Address::times_2));
9687 lea(dst, Address(dst, len, Address::times_1));
9688 negptr(len);
9689
9690 bind(copy_32_loop);
9691 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
9692 por(tmp4Reg, tmp2Reg);
9693 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9694 por(tmp4Reg, tmp3Reg);
9695 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
9696 jcc(Assembler::notZero, return_zero);
9697 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
9698 movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9699 addptr(len, 16);
9700 jcc(Assembler::notZero, copy_32_loop);
9701
9702 // compress next vector of 8 chars (if any)
9703 bind(copy_16);
9704 movl(len, result);
9705 andl(len, 0xfffffff8); // vector count (in chars)
9706 andl(result, 0x00000007); // tail count (in chars)
9707 testl(len, len);
9708 jccb(Assembler::zero, copy_tail);
9709
9710 movdl(tmp1Reg, tmp5);
9711 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
9712 pxor(tmp3Reg, tmp3Reg);
9713
9714 movdqu(tmp2Reg, Address(src, 0));
9715 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
9716 jccb(Assembler::notZero, return_zero);
9717 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
9718 movq(Address(dst, 0), tmp2Reg);
9719 addptr(src, 16);
9720 addptr(dst, 8);
9721
9722 bind(copy_tail);
9723 movl(len, result);
9724 }
9725 // compress 1 char per iter
9726 testl(len, len);
9727 jccb(Assembler::zero, return_length);
9728 lea(src, Address(src, len, Address::times_2));
9729 lea(dst, Address(dst, len, Address::times_1));
9730 negptr(len);
9731
9732 bind(copy_chars_loop);
9733 load_unsigned_short(result, Address(src, len, Address::times_2));
9734 testl(result, 0xff00); // check if Unicode char
9735 jccb(Assembler::notZero, return_zero);
9736 movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte
9737 increment(len);
9738 jcc(Assembler::notZero, copy_chars_loop);
9739
9740 // if compression succeeded, return length
9741 bind(return_length);
9742 pop(result);
9743 jmpb(done);
9744
9745 // if compression failed, return 0
9746 bind(return_zero);
9747 xorl(result, result);
9748 addptr(rsp, wordSize);
9749
9750 bind(done);
9751}
9752
9753// Inflate byte[] array to char[].
9754// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9755// @HotSpotIntrinsicCandidate
9756// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9757// for (int i = 0; i < len; i++) {
9758// dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9759// }
9760// }
9761void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9762 XMMRegister tmp1, Register tmp2) {
9763 Label copy_chars_loop, done, below_threshold;
9764 // rsi: src
9765 // rdi: dst
9766 // rdx: len
9767 // rcx: tmp2
9768
9769 // rsi holds start addr of source byte[] to be inflated
9770 // rdi holds start addr of destination char[]
9771 // rdx holds length
9772 assert_different_registers(src, dst, len, tmp2);
9773
9774 if ((UseAVX > 2) && // AVX512
9775 VM_Version::supports_avx512vlbw() &&
9776 VM_Version::supports_bmi2()) {
9777
9778 Label copy_32_loop, copy_tail;
9779 Register tmp3_aliased = len;
9780
9781 // if length of the string is less than 16, handle it in an old fashioned way
9782 testl(len, -16);
9783 jcc(Assembler::zero, below_threshold);
9784
9785 // In order to use only one arithmetic operation for the main loop we use
9786 // this pre-calculation
9787 movl(tmp2, len);
9788 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9789 andl(len, -32); // vector count
9790 jccb(Assembler::zero, copy_tail);
9791
9792 lea(src, Address(src, len, Address::times_1));
9793 lea(dst, Address(dst, len, Address::times_2));
9794 negptr(len);
9795
9796
9797 // inflate 32 chars per iter
9798 bind(copy_32_loop);
9799 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9800 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9801 addptr(len, 32);
9802 jcc(Assembler::notZero, copy_32_loop);
9803
9804 bind(copy_tail);
9805 // bail out when there is nothing to be done
9806 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9807 jcc(Assembler::zero, done);
9808
9809 // ~(~0 << length), where length is the # of remaining elements to process
9810 movl(tmp3_aliased, -1);
9811 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9812 notl(tmp3_aliased);
9813 kmovdl(k2, tmp3_aliased);
9814 evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
9815 evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
9816
9817 jmp(done);
9818 }
9819 if (UseSSE42Intrinsics) {
9820 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9821
9822 movl(tmp2, len);
9823
9824 if (UseAVX > 1) {
9825 andl(tmp2, (16 - 1));
9826 andl(len, -16);
9827 jccb(Assembler::zero, copy_new_tail);
9828 } else {
9829 andl(tmp2, 0x00000007); // tail count (in chars)
9830 andl(len, 0xfffffff8); // vector count (in chars)
9831 jccb(Assembler::zero, copy_tail);
9832 }
9833
9834 // vectored inflation
9835 lea(src, Address(src, len, Address::times_1));
9836 lea(dst, Address(dst, len, Address::times_2));
9837 negptr(len);
9838
9839 if (UseAVX > 1) {
9840 bind(copy_16_loop);
9841 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9842 vmovdqu(Address(dst, len, Address::times_2), tmp1);
9843 addptr(len, 16);
9844 jcc(Assembler::notZero, copy_16_loop);
9845
9846 bind(below_threshold);
9847 bind(copy_new_tail);
9848 if ((UseAVX > 2) &&
9849 VM_Version::supports_avx512vlbw() &&
9850 VM_Version::supports_bmi2()) {
9851 movl(tmp2, len);
9852 } else {
9853 movl(len, tmp2);
9854 }
9855 andl(tmp2, 0x00000007);
9856 andl(len, 0xFFFFFFF8);
9857 jccb(Assembler::zero, copy_tail);
9858
9859 pmovzxbw(tmp1, Address(src, 0));
9860 movdqu(Address(dst, 0), tmp1);
9861 addptr(src, 8);
9862 addptr(dst, 2 * 8);
9863
9864 jmp(copy_tail, true);
9865 }
9866
9867 // inflate 8 chars per iter
9868 bind(copy_8_loop);
9869 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
9870 movdqu(Address(dst, len, Address::times_2), tmp1);
9871 addptr(len, 8);
9872 jcc(Assembler::notZero, copy_8_loop);
9873
9874 bind(copy_tail);
9875 movl(len, tmp2);
9876
9877 cmpl(len, 4);
9878 jccb(Assembler::less, copy_bytes);
9879
9880 movdl(tmp1, Address(src, 0)); // load 4 byte chars
9881 pmovzxbw(tmp1, tmp1);
9882 movq(Address(dst, 0), tmp1);
9883 subptr(len, 4);
9884 addptr(src, 4);
9885 addptr(dst, 8);
9886
9887 bind(copy_bytes);
9888 } else {
9889 bind(below_threshold);
9890 }
9891
9892 testl(len, len);
9893 jccb(Assembler::zero, done);
9894 lea(src, Address(src, len, Address::times_1));
9895 lea(dst, Address(dst, len, Address::times_2));
9896 negptr(len);
9897
9898 // inflate 1 char per iter
9899 bind(copy_chars_loop);
9900 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
9901 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
9902 increment(len);
9903 jcc(Assembler::notZero, copy_chars_loop);
9904
9905 bind(done);
9906}
9907
9908Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9909 switch (cond) {
9910 // Note some conditions are synonyms for others
9911 case Assembler::zero: return Assembler::notZero;
9912 case Assembler::notZero: return Assembler::zero;
9913 case Assembler::less: return Assembler::greaterEqual;
9914 case Assembler::lessEqual: return Assembler::greater;
9915 case Assembler::greater: return Assembler::lessEqual;
9916 case Assembler::greaterEqual: return Assembler::less;
9917 case Assembler::below: return Assembler::aboveEqual;
9918 case Assembler::belowEqual: return Assembler::above;
9919 case Assembler::above: return Assembler::belowEqual;
9920 case Assembler::aboveEqual: return Assembler::below;
9921 case Assembler::overflow: return Assembler::noOverflow;
9922 case Assembler::noOverflow: return Assembler::overflow;
9923 case Assembler::negative: return Assembler::positive;
9924 case Assembler::positive: return Assembler::negative;
9925 case Assembler::parity: return Assembler::noParity;
9926 case Assembler::noParity: return Assembler::parity;
9927 }
9928 ShouldNotReachHere(); return Assembler::overflow;
9929}
9930
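// SkipIfEqual: the constructor emits a compare of the byte at flag_addr against
// 'value' and a conditional branch; everything emitted between construction and
// destruction is skipped at runtime when the flag equals 'value'.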
9931SkipIfEqual::SkipIfEqual(
9932 MacroAssembler* masm, const bool* flag_addr, bool value) {
9933 _masm = masm;
9934 _masm->cmp8(ExternalAddress((address)flag_addr), value);
9935 _masm->jcc(Assembler::equal, _label);
9936}
9937
9938SkipIfEqual::~SkipIfEqual() {
9939 _masm->bind(_label);
9940}
9941
9942// 32-bit Windows has its own fast-path implementation
9943// of get_thread
9944#if !defined(WIN32) || defined(_LP64)
9945
9946// This is simply a call to Thread::current()
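// The caller-saved registers that may hold live values are preserved around the
// runtime call below, since the C call can clobber them.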
9947void MacroAssembler::get_thread(Register thread) {
9948 if (thread != rax) {
9949 push(rax);
9950 }
9951 LP64_ONLY(push(rdi);)
9952 LP64_ONLY(push(rsi);)
9953 push(rdx);
9954 push(rcx);
9955#ifdef _LP64
9956 push(r8);
9957 push(r9);
9958 push(r10);
9959 push(r11);
9960#endif
9961
9962 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9963
9964#ifdef _LP64
9965 pop(r11);
9966 pop(r10);
9967 pop(r9);
9968 pop(r8);
9969#endif
9970 pop(rcx);
9971 pop(rdx);
9972 LP64_ONLY(pop(rsi);)
9973 LP64_ONLY(pop(rdi);)
9974 if (thread != rax) {
9975 mov(thread, rax);
9976 pop(rax);
9977 }
9978}
9979
9980#endif
9981