1/*
2 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#ifndef _WINDOWS
27#include "alloca.h"
28#endif
29#include "asm/macroAssembler.hpp"
30#include "asm/macroAssembler.inline.hpp"
31#include "code/debugInfoRec.hpp"
32#include "code/icBuffer.hpp"
33#include "code/nativeInst.hpp"
34#include "code/vtableStubs.hpp"
35#include "gc/shared/collectedHeap.hpp"
36#include "gc/shared/gcLocker.hpp"
37#include "gc/shared/barrierSet.hpp"
38#include "gc/shared/barrierSetAssembler.hpp"
39#include "interpreter/interpreter.hpp"
40#include "logging/log.hpp"
41#include "memory/resourceArea.hpp"
42#include "memory/universe.hpp"
43#include "oops/compiledICHolder.hpp"
44#include "runtime/safepointMechanism.hpp"
45#include "runtime/sharedRuntime.hpp"
46#include "runtime/vframeArray.hpp"
47#include "utilities/align.hpp"
48#include "utilities/formatBuffer.hpp"
49#include "vm_version_x86.hpp"
50#include "vmreg_x86.inline.hpp"
51#ifdef COMPILER1
52#include "c1/c1_Runtime1.hpp"
53#endif
54#ifdef COMPILER2
55#include "opto/runtime.hpp"
56#endif
57#if INCLUDE_JVMCI
58#include "jvmci/jvmciJavaClasses.hpp"
59#endif
60
61#define __ masm->
62
63const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
64
65class SimpleRuntimeFrame {
66
67 public:
68
69 // Most of the runtime stubs have this simple frame layout.
70 // This class exists to make the layout shared in one place.
71 // Offsets are for compiler stack slots, which are jints.
72 enum layout {
73 // The frame sender code expects that rbp will be in the "natural" place and
74 // will override any oopMap setting for it. We must therefore force the layout
75 // so that it agrees with the frame sender code.
76 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
77 rbp_off2,
78 return_off, return_off2,
79 framesize
80 };
81};
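// Worked example of the layout above (illustrative, assuming
// frame::arg_reg_save_area_bytes == 0 as on non-Windows builds):
//   rbp_off    = 0, rbp_off2    = 1   // saved rbp spans slots 0..1 (one 64-bit word)
//   return_off = 2, return_off2 = 3   // return address spans slots 2..3
//   framesize  = 4 compiler slots     // == 2 machine words
// On Windows the 32-byte argument register save area shifts everything up by
// 32/BytesPerInt == 8 slots.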
82
83class RegisterSaver {
84 // Capture info about frame layout. Layout offsets are in jint
85 // units because compiler frame slots are jints.
86#define XSAVE_AREA_BEGIN 160
87#define XSAVE_AREA_YMM_BEGIN 576
88#define XSAVE_AREA_ZMM_BEGIN 1152
89#define XSAVE_AREA_UPPERBANK 1664
90#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
91#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
92#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
93 enum layout {
94 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
95 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
96 DEF_XMM_OFFS(0),
97 DEF_XMM_OFFS(1),
98 // 2..15 are implied in range usage
99 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
100 DEF_YMM_OFFS(0),
101 DEF_YMM_OFFS(1),
102 // 2..15 are implied in range usage
103 zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
104 zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
105 DEF_ZMM_OFFS(16),
106 DEF_ZMM_OFFS(17),
107 // 18..31 are implied in range usage
108 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
109 fpu_stateH_end,
110 r15_off, r15H_off,
111 r14_off, r14H_off,
112 r13_off, r13H_off,
113 r12_off, r12H_off,
114 r11_off, r11H_off,
115 r10_off, r10H_off,
116 r9_off, r9H_off,
117 r8_off, r8H_off,
118 rdi_off, rdiH_off,
119 rsi_off, rsiH_off,
120 ignore_off, ignoreH_off, // extra copy of rbp
121 rsp_off, rspH_off,
122 rbx_off, rbxH_off,
123 rdx_off, rdxH_off,
124 rcx_off, rcxH_off,
125 rax_off, raxH_off,
126 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
127 align_off, alignH_off,
128 flags_off, flagsH_off,
129 // The frame sender code expects that rbp will be in the "natural" place and
130 // will override any oopMap setting for it. We must therefore force the layout
131 // so that it agrees with the frame sender code.
132 rbp_off, rbpH_off, // copy of rbp we will restore
133 return_off, returnH_off, // slot for return address
134 reg_save_size // size in compiler stack slots
135 };
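  // Worked example of the offset arithmetic above (illustrative, again assuming
  // frame::arg_reg_save_area_bytes == 0):
  //   fpu_state_off = 0
  //   xmm_off = 0 + 160/4         = 40 slots, so xmm0_off = 40, xmm1_off = 44, ...
  //                                    (each 16-byte XMM register spans 4 jint slots)
  //   ymm_off = 40 + (576-160)/4  = 144 slots (upper YMM halves in the xsave image)
  //   zmm_off = 40 + (1664-160)/4 = 416 slots (full zmm16..zmm31, 16 slots apiece)
  // The general purpose registers, flags and the rbp/return-address copies follow
  // the fxsave/xsave area, each taking two jint slots (one 64-bit word).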
136
137 public:
138 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
139 static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
140
141 // Offsets into the register save area
142 // Used by deoptimization when it is managing result register
143 // values on its own
144
145 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
146 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
147 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
148 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
149 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
150
151 // During deoptimization only the result registers need to be restored,
152 // all the other values have already been extracted.
153 static void restore_result_registers(MacroAssembler* masm);
154};
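// Typical usage pattern (a minimal sketch; the real blobs generated in this file
// add safepoint polling and exception handling around the runtime call):
//
//   int frame_size_in_words;
//   OopMapSet* oop_maps = new OopMapSet();
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
//   ...                                    // set last_Java_frame, call into the VM
//   oop_maps->add_gc_map(pc_offset, map);  // describe the saved oops at the call site
//   RegisterSaver::restore_live_registers(masm);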
155
156OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
157 int off = 0;
158 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
159 if (UseAVX < 3) {
160 num_xmm_regs = num_xmm_regs/2;
161 }
162#if COMPILER2_OR_JVMCI
163 if (save_vectors) {
164 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
165 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
166 }
167#else
168 assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
169#endif
170
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated that way.
172 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
173 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
174 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
175 // CodeBlob frame size is in words.
176 int frame_size_in_words = frame_size_in_bytes / wordSize;
177 *total_frame_words = frame_size_in_words;
178
179 // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter().
184
185 __ enter(); // rsp becomes 16-byte aligned here
186 __ push_CPU_state(); // Push a multiple of 16 bytes
187
  // push_CPU_state() handles this on EVEX enabled targets
189 if (save_vectors) {
190 // Save upper half of YMM registers(0..15)
191 int base_addr = XSAVE_AREA_YMM_BEGIN;
192 for (int n = 0; n < 16; n++) {
193 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
194 }
195 if (VM_Version::supports_evex()) {
196 // Save upper half of ZMM registers(0..15)
197 base_addr = XSAVE_AREA_ZMM_BEGIN;
198 for (int n = 0; n < 16; n++) {
199 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
200 }
201 // Save full ZMM registers(16..num_xmm_regs)
202 base_addr = XSAVE_AREA_UPPERBANK;
203 off = 0;
204 int vector_len = Assembler::AVX_512bit;
205 for (int n = 16; n < num_xmm_regs; n++) {
206 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
207 }
208 }
209 } else {
210 if (VM_Version::supports_evex()) {
211 // Save upper bank of ZMM registers(16..31) for double/float usage
212 int base_addr = XSAVE_AREA_UPPERBANK;
213 off = 0;
214 for (int n = 16; n < num_xmm_regs; n++) {
215 __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
216 }
217 }
218 }
219 __ vzeroupper();
220 if (frame::arg_reg_save_area_bytes != 0) {
221 // Allocate argument register save area
222 __ subptr(rsp, frame::arg_reg_save_area_bytes);
223 }
224
225 // Set an oopmap for the call site. This oopmap will map all
226 // oop-registers and debug-info registers as callee-saved. This
227 // will allow deoptimization at this safepoint to find all possible
228 // debug-info recordings, as well as let GC find all oops.
229
230 OopMapSet *oop_maps = new OopMapSet();
231 OopMap* map = new OopMap(frame_size_in_slots, 0);
232
233#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
234
235 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
236 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
237 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
238 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
239 // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
241 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
242 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
243 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
244 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
245 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
246 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
247 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
248 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
249 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
250 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area
253 off = xmm0_off;
254 int delta = xmm1_off - off;
255 for (int n = 0; n < 16; n++) {
256 XMMRegister xmm_name = as_XMMRegister(n);
257 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
258 off += delta;
259 }
260 if(UseAVX > 2) {
261 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
262 off = zmm16_off;
263 delta = zmm17_off - off;
264 for (int n = 16; n < num_xmm_regs; n++) {
265 XMMRegister zmm_name = as_XMMRegister(n);
266 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
267 off += delta;
268 }
269 }
270
271#if COMPILER2_OR_JVMCI
272 if (save_vectors) {
273 off = ymm0_off;
274 int delta = ymm1_off - off;
275 for (int n = 0; n < 16; n++) {
276 XMMRegister ymm_name = as_XMMRegister(n);
277 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
278 off += delta;
279 }
280 }
281#endif // COMPILER2_OR_JVMCI
282
283 // %%% These should all be a waste but we'll keep things as they were for now
284 if (true) {
285 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
286 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
287 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
288 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
289 // rbp location is known implicitly by the frame sender code, needs no oopmap
290 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
291 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
292 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
293 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
294 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
295 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
296 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
297 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
298 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
299 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area
302 off = xmm0H_off;
303 delta = xmm1H_off - off;
304 for (int n = 0; n < 16; n++) {
305 XMMRegister xmm_name = as_XMMRegister(n);
306 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
307 off += delta;
308 }
309 if (UseAVX > 2) {
310 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
311 off = zmm16H_off;
312 delta = zmm17H_off - off;
313 for (int n = 16; n < num_xmm_regs; n++) {
314 XMMRegister zmm_name = as_XMMRegister(n);
315 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
316 off += delta;
317 }
318 }
319 }
320
321 return map;
322}
323
324void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
325 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
326 if (UseAVX < 3) {
327 num_xmm_regs = num_xmm_regs/2;
328 }
329 if (frame::arg_reg_save_area_bytes != 0) {
330 // Pop arg register save area
331 __ addptr(rsp, frame::arg_reg_save_area_bytes);
332 }
333
334#if COMPILER2_OR_JVMCI
335 if (restore_vectors) {
336 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
337 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
338 }
339#else
340 assert(!restore_vectors, "vectors are generated only by C2");
341#endif
342
343 __ vzeroupper();
344
345 // On EVEX enabled targets everything is handled in pop fpu state
346 if (restore_vectors) {
347 // Restore upper half of YMM registers (0..15)
348 int base_addr = XSAVE_AREA_YMM_BEGIN;
349 for (int n = 0; n < 16; n++) {
350 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
351 }
352 if (VM_Version::supports_evex()) {
353 // Restore upper half of ZMM registers (0..15)
354 base_addr = XSAVE_AREA_ZMM_BEGIN;
355 for (int n = 0; n < 16; n++) {
356 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
357 }
358 // Restore full ZMM registers(16..num_xmm_regs)
359 base_addr = XSAVE_AREA_UPPERBANK;
360 int vector_len = Assembler::AVX_512bit;
361 int off = 0;
362 for (int n = 16; n < num_xmm_regs; n++) {
363 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
364 }
365 }
366 } else {
367 if (VM_Version::supports_evex()) {
368 // Restore upper bank of ZMM registers(16..31) for double/float usage
369 int base_addr = XSAVE_AREA_UPPERBANK;
370 int off = 0;
371 for (int n = 16; n < num_xmm_regs; n++) {
372 __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
373 }
374 }
375 }
376
377 // Recover CPU state
378 __ pop_CPU_state();
379 // Get the rbp described implicitly by the calling convention (no oopMap)
380 __ pop(rbp);
381}
382
383void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
384
385 // Just restore result register. Only used by deoptimization. By
386 // now any callee save register that needs to be restored to a c2
387 // caller of the deoptee has been extracted into the vframeArray
388 // and will be stuffed into the c2i adapter we create for later
389 // restoration so only result registers need to be restored here.
390
391 // Restore fp result register
392 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
393 // Restore integer result register
394 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
395 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
396
  // Pop all of the register save area off the stack except the return address
398 __ addptr(rsp, return_offset_in_bytes());
399}
400
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
403bool SharedRuntime::is_wide_vector(int size) {
404 return size > 16;
405}
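// For example, a 32-byte YMM or 64-byte ZMM vector is "wide" and requires the
// explicit vector save/restore paths in RegisterSaver above.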
406
407size_t SharedRuntime::trampoline_size() {
408 return 16;
409}
410
411void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
412 __ jump(RuntimeAddress(destination));
413}
414
415// The java_calling_convention describes stack locations as ideal slots on
416// a frame with no abi restrictions. Since we must observe abi restrictions
417// (like the placement of the register window) the slots must be biased by
418// the following value.
419static int reg2offset_in(VMReg r) {
420 // Account for saved rbp and return address
421 // This should really be in_preserve_stack_slots
422 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
423}
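// Example: an incoming stack argument in VMReg stack slot 0 is addressed as
// rbp + (0 + 4) * 4 = rbp + 16, i.e. just above the saved rbp (at rbp + 0) and
// the return address (at rbp + 8).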
424
425static int reg2offset_out(VMReg r) {
426 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
427}
428
429// ---------------------------------------------------------------------------
430// Read the array of BasicTypes from a signature, and compute where the
431// arguments should go. Values in the VMRegPair regs array refer to 4-byte
432// quantities. Values less than VMRegImpl::stack0 are registers, those above
433// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
434// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher. Register
// values up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.
439
440// Note: the INPUTS in sig_bt are in units of Java argument words, which are
441// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
442// units regardless of build. Of course for i486 there is no 64 bit build
443
444// The Java calling convention is a "shifted" version of the C ABI.
445// By skipping the first C ABI register we can call non-static jni methods
446// with small numbers of arguments without having to shuffle the arguments
447// at all. Since we control the java ABI we ought to at least get some
448// advantage out of it.
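// Illustrative example: for an argument list (long, int, Object), sig_bt is
// { T_LONG, T_VOID, T_INT, T_OBJECT } and the loop below assigns
//   T_LONG -> j_rarg0 (both halves), T_VOID -> bad (just the long's upper-half
//   placeholder), T_INT -> j_rarg1, T_OBJECT -> j_rarg2.
// Only once the six j_rarg registers are exhausted do arguments spill to
// 4-byte stack slots, two slots at a time.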
449
450int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
451 VMRegPair *regs,
452 int total_args_passed,
453 int is_outgoing) {
454
455 // Create the mapping between argument positions and
456 // registers.
457 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
458 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
459 };
460 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
461 j_farg0, j_farg1, j_farg2, j_farg3,
462 j_farg4, j_farg5, j_farg6, j_farg7
463 };
464
465
466 uint int_args = 0;
467 uint fp_args = 0;
468 uint stk_args = 0; // inc by 2 each time
469
470 for (int i = 0; i < total_args_passed; i++) {
471 switch (sig_bt[i]) {
472 case T_BOOLEAN:
473 case T_CHAR:
474 case T_BYTE:
475 case T_SHORT:
476 case T_INT:
477 if (int_args < Argument::n_int_register_parameters_j) {
478 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
479 } else {
480 regs[i].set1(VMRegImpl::stack2reg(stk_args));
481 stk_args += 2;
482 }
483 break;
484 case T_VOID:
485 // halves of T_LONG or T_DOUBLE
486 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
487 regs[i].set_bad();
488 break;
489 case T_LONG:
490 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
491 // fall through
492 case T_OBJECT:
493 case T_ARRAY:
494 case T_ADDRESS:
495 if (int_args < Argument::n_int_register_parameters_j) {
496 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
497 } else {
498 regs[i].set2(VMRegImpl::stack2reg(stk_args));
499 stk_args += 2;
500 }
501 break;
502 case T_FLOAT:
503 if (fp_args < Argument::n_float_register_parameters_j) {
504 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
505 } else {
506 regs[i].set1(VMRegImpl::stack2reg(stk_args));
507 stk_args += 2;
508 }
509 break;
510 case T_DOUBLE:
511 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
512 if (fp_args < Argument::n_float_register_parameters_j) {
513 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
514 } else {
515 regs[i].set2(VMRegImpl::stack2reg(stk_args));
516 stk_args += 2;
517 }
518 break;
519 default:
520 ShouldNotReachHere();
521 break;
522 }
523 }
524
525 return align_up(stk_args, 2);
526}
527
// Patch the caller's callsite with the entry to compiled code if it exists.
529static void patch_callers_callsite(MacroAssembler *masm) {
530 Label L;
531 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
532 __ jcc(Assembler::equal, L);
533
534 // Save the current stack pointer
535 __ mov(r13, rsp);
536 // Schedule the branch target address early.
537 // Call into the VM to patch the caller, then jump to compiled callee
538 // rax isn't live so capture return address while we easily can
539 __ movptr(rax, Address(rsp, 0));
540
541 // align stack so push_CPU_state doesn't fault
542 __ andptr(rsp, -(StackAlignmentInBytes));
543 __ push_CPU_state();
544 __ vzeroupper();
545 // VM needs caller's callsite
546 // VM needs target method
547 // This needs to be a long call since we will relocate this adapter to
548 // the codeBuffer and it may not reach
549
550 // Allocate argument register save area
551 if (frame::arg_reg_save_area_bytes != 0) {
552 __ subptr(rsp, frame::arg_reg_save_area_bytes);
553 }
554 __ mov(c_rarg0, rbx);
555 __ mov(c_rarg1, rax);
556 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
557
558 // De-allocate argument register save area
559 if (frame::arg_reg_save_area_bytes != 0) {
560 __ addptr(rsp, frame::arg_reg_save_area_bytes);
561 }
562
563 __ vzeroupper();
564 __ pop_CPU_state();
565 // restore sp
566 __ mov(rsp, r13);
567 __ bind(L);
568}
569
570
571static void gen_c2i_adapter(MacroAssembler *masm,
572 int total_args_passed,
573 int comp_args_on_stack,
574 const BasicType *sig_bt,
575 const VMRegPair *regs,
576 Label& skip_fixup) {
577 // Before we get into the guts of the C2I adapter, see if we should be here
578 // at all. We've come from compiled code and are attempting to jump to the
579 // interpreter, which means the caller made a static call to get here
580 // (vcalls always get a compiled target if there is one). Check for a
581 // compiled target. If there is one, we need to patch the caller's call.
582 patch_callers_callsite(masm);
583
584 __ bind(skip_fixup);
585
  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.  Plus one extra word
  // for the return address location, since we store it first rather than
  // hold it in rax across all the shuffling.
590
591 int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
592
593 // stack is aligned, keep it that way
594 extraspace = align_up(extraspace, 2*wordSize);
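  // For example, with an 8-byte Interpreter::stackElementSize and 3 interpreter
  // argument slots this is 3*8 + 8 = 32 bytes (already 16-byte aligned); with 4
  // slots it is 4*8 + 8 = 40, rounded up to 48 to keep the stack 16-byte aligned.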
595
596 // Get return address
597 __ pop(rax);
598
599 // set senderSP value
600 __ mov(r13, rsp);
601
602 __ subptr(rsp, extraspace);
603
604 // Store the return address in the expected location
605 __ movptr(Address(rsp, 0), rax);
606
607 // Now write the args into the outgoing interpreter space
608 for (int i = 0; i < total_args_passed; i++) {
609 if (sig_bt[i] == T_VOID) {
610 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
611 continue;
612 }
613
614 // offset to start parameters
615 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
616 int next_off = st_off - Interpreter::stackElementSize;
617
618 // Say 4 args:
619 // i st_off
620 // 0 32 T_LONG
621 // 1 24 T_VOID
622 // 2 16 T_OBJECT
623 // 3 8 T_BOOL
624 // - 0 return address
625 //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
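    // E.g. in the 4-arg table above, the T_LONG at i == 0 is stored to
    // next_off == 24 (the T_VOID's slot) and slot 32 is left unused
    // (overwritten with junk under ASSERT).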
630
631 VMReg r_1 = regs[i].first();
632 VMReg r_2 = regs[i].second();
633 if (!r_1->is_valid()) {
634 assert(!r_2->is_valid(), "");
635 continue;
636 }
637 if (r_1->is_stack()) {
638 // memory to memory use rax
639 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
640 if (!r_2->is_valid()) {
641 // sign extend??
642 __ movl(rax, Address(rsp, ld_off));
643 __ movptr(Address(rsp, st_off), rax);
644
645 } else {
646
647 __ movq(rax, Address(rsp, ld_off));
648
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
650 // T_DOUBLE and T_LONG use two slots in the interpreter
651 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
652 // ld_off == LSW, ld_off+wordSize == MSW
653 // st_off == MSW, next_off == LSW
654 __ movq(Address(rsp, next_off), rax);
655#ifdef ASSERT
656 // Overwrite the unused slot with known junk
657 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
658 __ movptr(Address(rsp, st_off), rax);
659#endif /* ASSERT */
660 } else {
661 __ movq(Address(rsp, st_off), rax);
662 }
663 }
664 } else if (r_1->is_Register()) {
665 Register r = r_1->as_Register();
666 if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to the slot
668 // why not sign extend??
669 __ movl(Address(rsp, st_off), r);
670 } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
672 // T_DOUBLE and T_LONG use two slots in the interpreter
673 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
674 // long/double in gpr
675#ifdef ASSERT
676 // Overwrite the unused slot with known junk
677 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
678 __ movptr(Address(rsp, st_off), rax);
679#endif /* ASSERT */
680 __ movq(Address(rsp, next_off), r);
681 } else {
682 __ movptr(Address(rsp, st_off), r);
683 }
684 }
685 } else {
686 assert(r_1->is_XMMRegister(), "");
687 if (!r_2->is_valid()) {
        // only a float; use just part of the slot
689 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
690 } else {
691#ifdef ASSERT
692 // Overwrite the unused slot with known junk
693 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
694 __ movptr(Address(rsp, st_off), rax);
695#endif /* ASSERT */
696 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
697 }
698 }
699 }
700
701 // Schedule the branch target address early.
702 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
703 __ jmp(rcx);
704}
705
706static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
707 address code_start, address code_end,
708 Label& L_ok) {
709 Label L_fail;
710 __ lea(temp_reg, ExternalAddress(code_start));
711 __ cmpptr(pc_reg, temp_reg);
712 __ jcc(Assembler::belowEqual, L_fail);
713 __ lea(temp_reg, ExternalAddress(code_end));
714 __ cmpptr(pc_reg, temp_reg);
715 __ jcc(Assembler::below, L_ok);
716 __ bind(L_fail);
717}
718
719void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
720 int total_args_passed,
721 int comp_args_on_stack,
722 const BasicType *sig_bt,
723 const VMRegPair *regs) {
724
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment expected by all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
733
734 // Adapters can be frameless because they do not require the caller
735 // to perform additional cleanup work, such as correcting the stack pointer.
736 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
737 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
738 // even if a callee has modified the stack pointer.
739 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
740 // routinely repairs its caller's stack pointer (from sender_sp, which is set
741 // up via the senderSP register).
742 // In other words, if *either* the caller or callee is interpreted, we can
743 // get the stack pointer repaired after a call.
744 // This is why c2i and i2c adapters cannot be indefinitely composed.
745 // In particular, if a c2i adapter were to somehow call an i2c adapter,
746 // both caller and callee would be compiled methods, and neither would
747 // clean up the stack pointer changes performed by the two adapters.
748 // If this happens, control eventually transfers back to the compiled
749 // caller, but with an uncorrected stack, causing delayed havoc.
750
751 // Pick up the return address
752 __ movptr(rax, Address(rsp, 0));
753
754 if (VerifyAdapterCalls &&
755 (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
756 // So, let's test for cascading c2i/i2c adapters right now.
757 // assert(Interpreter::contains($return_addr) ||
758 // StubRoutines::contains($return_addr),
759 // "i2c adapter must return to an interpreter frame");
760 __ block_comment("verify_i2c { ");
761 Label L_ok;
762 if (Interpreter::code() != NULL)
763 range_check(masm, rax, r11,
764 Interpreter::code()->code_start(), Interpreter::code()->code_end(),
765 L_ok);
766 if (StubRoutines::code1() != NULL)
767 range_check(masm, rax, r11,
768 StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
769 L_ok);
770 if (StubRoutines::code2() != NULL)
771 range_check(masm, rax, r11,
772 StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
773 L_ok);
774 const char* msg = "i2c adapter must return to an interpreter frame";
775 __ block_comment(msg);
776 __ stop(msg);
777 __ bind(L_ok);
    __ block_comment("} verify_i2c ");
779 }
780
781 // Must preserve original SP for loading incoming arguments because
782 // we need to align the outgoing SP for compiled code.
783 __ movptr(r11, rsp);
784
785 // Cut-out for having no stack args. Since up to 2 int/oop args are passed
786 // in registers, we will occasionally have no stack args.
787 int comp_words_on_stack = 0;
788 if (comp_args_on_stack) {
789 // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
790 // registers are below. By subtracting stack0, we either get a negative
791 // number (all values in registers) or the maximum stack slot accessed.
792
793 // Convert 4-byte c2 stack slots to words.
794 comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
796 comp_words_on_stack = align_up(comp_words_on_stack, 2);
797 __ subptr(rsp, comp_words_on_stack * wordSize);
798 }
799
800
801 // Ensure compiled code always sees stack at proper alignment
802 __ andptr(rsp, -16);
803
  // Push the return address, misaligning the stack so the youngest frame sees
  // it the way it always does: as if just placed there by a call instruction.
806 __ push(rax);
807
808 // Put saved SP in another register
809 const Register saved_sp = rax;
810 __ movptr(saved_sp, r11);
811
812 // Will jump to the compiled code just as if compiled code was doing it.
813 // Pre-load the register-jump target early, to schedule it better.
814 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
815
816#if INCLUDE_JVMCI
817 if (EnableJVMCI || UseAOT) {
818 // check if this call should be routed towards a specific entry point
819 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
820 Label no_alternative_target;
821 __ jcc(Assembler::equal, no_alternative_target);
822 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
823 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
824 __ bind(no_alternative_target);
825 }
826#endif // INCLUDE_JVMCI
827
828 // Now generate the shuffle code. Pick up all register args and move the
829 // rest through the floating point stack top.
830 for (int i = 0; i < total_args_passed; i++) {
831 if (sig_bt[i] == T_VOID) {
832 // Longs and doubles are passed in native word order, but misaligned
833 // in the 32-bit build.
834 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
835 continue;
836 }
837
838 // Pick up 0, 1 or 2 words from SP+offset.
839
840 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
841 "scrambled load targets?");
842 // Load in argument order going down.
843 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
844 // Point to interpreter value (vs. tag)
845 int next_off = ld_off - Interpreter::stackElementSize;
846 //
847 //
848 //
849 VMReg r_1 = regs[i].first();
850 VMReg r_2 = regs[i].second();
851 if (!r_1->is_valid()) {
852 assert(!r_2->is_valid(), "");
853 continue;
854 }
855 if (r_1->is_stack()) {
856 // Convert stack slot to an SP offset (+ wordSize to account for return address )
857 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
858
859 // We can use r13 as a temp here because compiled code doesn't need r13 as an input
860 // and if we end up going thru a c2i because of a miss a reasonable value of r13
861 // will be generated.
862 if (!r_2->is_valid()) {
863 // sign extend???
864 __ movl(r13, Address(saved_sp, ld_off));
865 __ movptr(Address(rsp, st_off), r13);
866 } else {
867 //
868 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
870 // So we must adjust where to pick up the data to match the interpreter.
871 //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative offsets so the LSW is at the LOW address
874
875 // ld_off is MSW so get LSW
876 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
877 next_off : ld_off;
878 __ movq(r13, Address(saved_sp, offset));
879 // st_off is LSW (i.e. reg.first())
880 __ movq(Address(rsp, st_off), r13);
881 }
882 } else if (r_1->is_Register()) { // Register argument
883 Register r = r_1->as_Register();
884 assert(r != rax, "must be different");
885 if (r_2->is_valid()) {
886 //
887 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
      // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
889 // So we must adjust where to pick up the data to match the interpreter.
890
891 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
892 next_off : ld_off;
893
894 // this can be a misaligned move
895 __ movq(r, Address(saved_sp, offset));
896 } else {
897 // sign extend and use a full word?
898 __ movl(r, Address(saved_sp, ld_off));
899 }
900 } else {
901 if (!r_2->is_valid()) {
902 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
903 } else {
904 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
905 }
906 }
907 }
908
909 // 6243940 We might end up in handle_wrong_method if
910 // the callee is deoptimized as we race thru here. If that
911 // happens we don't want to take a safepoint because the
912 // caller frame will look interpreted and arguments are now
913 // "compiled" so it is much better to make this transition
914 // invisible to the stack walking code. Unfortunately if
915 // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
918
919 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
920
  // Put the Method* where a c2i would expect it should we end up there.
  // Only needed because c2 resolve stubs return the Method* as a result in
  // rax.
924 __ mov(rax, rbx);
925 __ jmp(r11);
926}
927
928// ---------------------------------------------------------------
929AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
930 int total_args_passed,
931 int comp_args_on_stack,
932 const BasicType *sig_bt,
933 const VMRegPair *regs,
934 AdapterFingerPrint* fingerprint) {
935 address i2c_entry = __ pc();
936
937 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
938
939 // -------------------------------------------------------------------------
940 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
941 // to the interpreter. The args start out packed in the compiled layout. They
942 // need to be unpacked into the interpreter layout. This will almost always
943 // require some stack space. We grow the current (compiled) stack, then repack
944 // the args. We finally end in a jump to the generic interpreter entry point.
945 // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
947
948 address c2i_unverified_entry = __ pc();
949 Label skip_fixup;
950 Label ok;
951
952 Register holder = rax;
953 Register receiver = j_rarg0;
954 Register temp = rbx;
955
956 {
957 __ load_klass(temp, receiver);
958 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
959 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
960 __ jcc(Assembler::equal, ok);
961 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
962
963 __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
967 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
968 __ jcc(Assembler::equal, skip_fixup);
969 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
970 }
971
972 address c2i_entry = __ pc();
973
974 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
975 bs->c2i_entry_barrier(masm);
976
977 // Class initialization barrier for static methods
978 if (VM_Version::supports_fast_class_init_checks()) {
979 Label L_skip_barrier;
980 Register method = rbx;
981
982 { // Bypass the barrier for non-static methods
983 Register flags = rscratch1;
984 __ movl(flags, Address(method, Method::access_flags_offset()));
985 __ testl(flags, JVM_ACC_STATIC);
986 __ jcc(Assembler::zero, L_skip_barrier); // non-static
987 }
988
989 Register klass = rscratch1;
990 __ load_method_holder(klass, method);
991 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
992
993 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
994
995 __ bind(L_skip_barrier);
996 }
997
998 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
999
1000 __ flush();
1001 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
1002}
1003
1004int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1005 VMRegPair *regs,
1006 VMRegPair *regs2,
1007 int total_args_passed) {
1008 assert(regs2 == NULL, "not needed on x86");
// We return the number of VMRegImpl stack slots we need to reserve for all
1010// the arguments NOT counting out_preserve_stack_slots.
1011
1012// NOTE: These arrays will have to change when c1 is ported
1013#ifdef _WIN64
1014 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1015 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1016 };
1017 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1018 c_farg0, c_farg1, c_farg2, c_farg3
1019 };
1020#else
1021 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1022 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1023 };
1024 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1025 c_farg0, c_farg1, c_farg2, c_farg3,
1026 c_farg4, c_farg5, c_farg6, c_farg7
1027 };
1028#endif // _WIN64
1029
1030
1031 uint int_args = 0;
1032 uint fp_args = 0;
1033 uint stk_args = 0; // inc by 2 each time
1034
1035 for (int i = 0; i < total_args_passed; i++) {
1036 switch (sig_bt[i]) {
1037 case T_BOOLEAN:
1038 case T_CHAR:
1039 case T_BYTE:
1040 case T_SHORT:
1041 case T_INT:
1042 if (int_args < Argument::n_int_register_parameters_c) {
1043 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1044#ifdef _WIN64
1045 fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1047 stk_args += 2;
1048#endif
1049 } else {
1050 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1051 stk_args += 2;
1052 }
1053 break;
1054 case T_LONG:
1055 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1056 // fall through
1057 case T_OBJECT:
1058 case T_ARRAY:
1059 case T_ADDRESS:
1060 case T_METADATA:
1061 if (int_args < Argument::n_int_register_parameters_c) {
1062 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1063#ifdef _WIN64
1064 fp_args++;
1065 stk_args += 2;
1066#endif
1067 } else {
1068 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1069 stk_args += 2;
1070 }
1071 break;
1072 case T_FLOAT:
1073 if (fp_args < Argument::n_float_register_parameters_c) {
1074 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1075#ifdef _WIN64
1076 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1078 stk_args += 2;
1079#endif
1080 } else {
1081 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1082 stk_args += 2;
1083 }
1084 break;
1085 case T_DOUBLE:
1086 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1087 if (fp_args < Argument::n_float_register_parameters_c) {
1088 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1089#ifdef _WIN64
1090 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1092 stk_args += 2;
1093#endif
1094 } else {
1095 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1096 stk_args += 2;
1097 }
1098 break;
1099 case T_VOID: // Halves of longs and doubles
1100 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1101 regs[i].set_bad();
1102 break;
1103 default:
1104 ShouldNotReachHere();
1105 break;
1106 }
1107 }
1108#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1111 if (stk_args < 8) {
1112 stk_args = 8;
1113 }
1114#endif // _WIN64
1115
1116 return stk_args;
1117}
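// Illustrative example: for a native signature (jint, jlong, jfloat, jdouble),
// the System V path above assigns c_rarg0, c_rarg1, c_farg0, c_farg1 and returns
// 0 stack slots, while the Win64 path assigns c_rarg0, c_rarg1, c_farg2, c_farg3
// (integer and floating point argument positions advance together) and returns
// 8 slots, the Win64 minimum, covering the 32-byte register shadow area.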
1118
// On 64 bit we will store integer-like items to the stack as
// 64-bit items (sparc abi) even though java would only store
// 32 bits for a parameter. On 32 bit it will simply be 32 bits.
// So this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
1123static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1124 if (src.first()->is_stack()) {
1125 if (dst.first()->is_stack()) {
1126 // stack to stack
1127 __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
1128 __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1129 } else {
1130 // stack to reg
1131 __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1132 }
1133 } else if (dst.first()->is_stack()) {
1134 // reg to stack
1135 // Do we really have to sign extend???
1136 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1137 __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1138 } else {
1139 // Do we really have to sign extend???
1140 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1141 if (dst.first() != src.first()) {
1142 __ movq(dst.first()->as_Register(), src.first()->as_Register());
1143 }
1144 }
1145}
1146
1147static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1148 if (src.first()->is_stack()) {
1149 if (dst.first()->is_stack()) {
1150 // stack to stack
1151 __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1152 __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1153 } else {
1154 // stack to reg
1155 __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1156 }
1157 } else if (dst.first()->is_stack()) {
1158 // reg to stack
1159 __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1160 } else {
1161 if (dst.first() != src.first()) {
1162 __ movq(dst.first()->as_Register(), src.first()->as_Register());
1163 }
1164 }
1165}
1166
1167// An oop arg. Must pass a handle not the oop itself
1168static void object_move(MacroAssembler* masm,
1169 OopMap* map,
1170 int oop_handle_offset,
1171 int framesize_in_slots,
1172 VMRegPair src,
1173 VMRegPair dst,
1174 bool is_receiver,
1175 int* receiver_offset) {
1176
1177 // must pass a handle. First figure out the location we use as a handle
1178
1179 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1180
  // See if the oop is NULL; if it is we need no handle
1182
1183 if (src.first()->is_stack()) {
1184
1185 // Oop is already on the stack as an argument
1186 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1187 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1188 if (is_receiver) {
1189 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1190 }
1191
1192 __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1193 __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1194 // conditionally move a NULL
1195 __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1196 } else {
1197
    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if the oop is non-NULL
1200
1201 const Register rOop = src.first()->as_Register();
1202 int oop_slot;
1203 if (rOop == j_rarg0)
1204 oop_slot = 0;
1205 else if (rOop == j_rarg1)
1206 oop_slot = 1;
1207 else if (rOop == j_rarg2)
1208 oop_slot = 2;
1209 else if (rOop == j_rarg3)
1210 oop_slot = 3;
1211 else if (rOop == j_rarg4)
1212 oop_slot = 4;
1213 else {
1214 assert(rOop == j_rarg5, "wrong register");
1215 oop_slot = 5;
1216 }
1217
1218 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1219 int offset = oop_slot*VMRegImpl::stack_slot_size;
1220
1221 map->set_oop(VMRegImpl::stack2reg(oop_slot));
1222 // Store oop in handle area, may be NULL
1223 __ movptr(Address(rsp, offset), rOop);
1224 if (is_receiver) {
1225 *receiver_offset = offset;
1226 }
1227
1228 __ cmpptr(rOop, (int32_t)NULL_WORD);
1229 __ lea(rHandle, Address(rsp, offset));
1230 // conditionally move a NULL from the handle area where it was just stored
1231 __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1232 }
1233
1234 // If arg is on the stack then place it otherwise it is already in correct reg.
1235 if (dst.first()->is_stack()) {
1236 __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1237 }
1238}
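// Net effect: the native callee receives a jobject that is the address of a
// stack slot holding the oop (or NULL for a NULL reference), and the OopMap
// entries recorded above let the GC find and update that slot during the
// native call.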
1239
// A float arg may have to do float reg to int reg conversion
1241static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1242 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
1243
  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
1246 // This greatly simplifies the cases here compared to sparc.
1247
1248 if (src.first()->is_stack()) {
1249 if (dst.first()->is_stack()) {
1250 __ movl(rax, Address(rbp, reg2offset_in(src.first())));
1251 __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
1252 } else {
1253 // stack to reg
1254 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1255 __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
1256 }
1257 } else if (dst.first()->is_stack()) {
1258 // reg to stack
1259 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1260 __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1261 } else {
1262 // reg to reg
1263 // In theory these overlap but the ordering is such that this is likely a nop
1264 if ( src.first() != dst.first()) {
1265 __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
1266 }
1267 }
1268}
1269
1270// A long move
1271static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1272
  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
1275 // This greatly simplifies the cases here compared to sparc.
1276
1277 if (src.is_single_phys_reg() ) {
1278 if (dst.is_single_phys_reg()) {
1279 if (dst.first() != src.first()) {
1280 __ mov(dst.first()->as_Register(), src.first()->as_Register());
1281 }
1282 } else {
1283 assert(dst.is_single_reg(), "not a stack pair");
1284 __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1285 }
1286 } else if (dst.is_single_phys_reg()) {
1287 assert(src.is_single_reg(), "not a stack pair");
1288 __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
1289 } else {
1290 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
1291 __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1292 __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1293 }
1294}
1295
1296// A double move
1297static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
1298
  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
1301 // This greatly simplifies the cases here compared to sparc.
1302
1303 if (src.is_single_phys_reg() ) {
1304 if (dst.is_single_phys_reg()) {
1305 // In theory these overlap but the ordering is such that this is likely a nop
1306 if ( src.first() != dst.first()) {
1307 __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
1308 }
1309 } else {
1310 assert(dst.is_single_reg(), "not a stack pair");
1311 __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1312 }
1313 } else if (dst.is_single_phys_reg()) {
1314 assert(src.is_single_reg(), "not a stack pair");
1315 __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
1316 } else {
1317 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
1318 __ movq(rax, Address(rbp, reg2offset_in(src.first())));
1319 __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
1320 }
1321}
1322
1323
1324void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1327 switch (ret_type) {
1328 case T_FLOAT:
1329 __ movflt(Address(rbp, -wordSize), xmm0);
1330 break;
1331 case T_DOUBLE:
1332 __ movdbl(Address(rbp, -wordSize), xmm0);
1333 break;
1334 case T_VOID: break;
1335 default: {
1336 __ movptr(Address(rbp, -wordSize), rax);
1337 }
1338 }
1339}
1340
1341void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1344 switch (ret_type) {
1345 case T_FLOAT:
1346 __ movflt(xmm0, Address(rbp, -wordSize));
1347 break;
1348 case T_DOUBLE:
1349 __ movdbl(xmm0, Address(rbp, -wordSize));
1350 break;
1351 case T_VOID: break;
1352 default: {
1353 __ movptr(rax, Address(rbp, -wordSize));
1354 }
1355 }
1356}
1357
1358static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1359 for ( int i = first_arg ; i < arg_count ; i++ ) {
1360 if (args[i].first()->is_Register()) {
1361 __ push(args[i].first()->as_Register());
1362 } else if (args[i].first()->is_XMMRegister()) {
1363 __ subptr(rsp, 2*wordSize);
1364 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1365 }
1366 }
1367}
1368
1369static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1370 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1371 if (args[i].first()->is_Register()) {
1372 __ pop(args[i].first()->as_Register());
1373 } else if (args[i].first()->is_XMMRegister()) {
1374 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1375 __ addptr(rsp, 2*wordSize);
1376 }
1377 }
1378}
1379
1380
1381static void save_or_restore_arguments(MacroAssembler* masm,
1382 const int stack_slots,
1383 const int total_in_args,
1384 const int arg_save_area,
1385 OopMap* map,
1386 VMRegPair* in_regs,
1387 BasicType* in_sig_bt) {
1388 // if map is non-NULL then the code should store the values,
1389 // otherwise it should load them.
1390 int slot = arg_save_area;
  // Save down the double-word values first
1392 for ( int i = 0; i < total_in_args; i++) {
1393 if (in_regs[i].first()->is_XMMRegister() && in_sig_bt[i] == T_DOUBLE) {
1394 int offset = slot * VMRegImpl::stack_slot_size;
1395 slot += VMRegImpl::slots_per_word;
1396 assert(slot <= stack_slots, "overflow");
1397 if (map != NULL) {
1398 __ movdbl(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1399 } else {
1400 __ movdbl(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1401 }
1402 }
1403 if (in_regs[i].first()->is_Register() &&
1404 (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
1405 int offset = slot * VMRegImpl::stack_slot_size;
1406 if (map != NULL) {
1407 __ movq(Address(rsp, offset), in_regs[i].first()->as_Register());
1408 if (in_sig_bt[i] == T_ARRAY) {
          map->set_oop(VMRegImpl::stack2reg(slot));
1410 }
1411 } else {
1412 __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
1413 }
1414 slot += VMRegImpl::slots_per_word;
1415 }
1416 }
1417 // Save or restore single word registers
1418 for ( int i = 0; i < total_in_args; i++) {
1419 if (in_regs[i].first()->is_Register()) {
1420 int offset = slot * VMRegImpl::stack_slot_size;
1421 slot++;
1422 assert(slot <= stack_slots, "overflow");
1423
      // Value is in an input register; we must flush it to the stack
1425 const Register reg = in_regs[i].first()->as_Register();
1426 switch (in_sig_bt[i]) {
1427 case T_BOOLEAN:
1428 case T_CHAR:
1429 case T_BYTE:
1430 case T_SHORT:
1431 case T_INT:
1432 if (map != NULL) {
1433 __ movl(Address(rsp, offset), reg);
1434 } else {
1435 __ movl(reg, Address(rsp, offset));
1436 }
1437 break;
1438 case T_ARRAY:
1439 case T_LONG:
1440 // handled above
1441 break;
1442 case T_OBJECT:
1443 default: ShouldNotReachHere();
1444 }
1445 } else if (in_regs[i].first()->is_XMMRegister()) {
1446 if (in_sig_bt[i] == T_FLOAT) {
1447 int offset = slot * VMRegImpl::stack_slot_size;
1448 slot++;
1449 assert(slot <= stack_slots, "overflow");
1450 if (map != NULL) {
1451 __ movflt(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
1452 } else {
1453 __ movflt(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
1454 }
1455 }
1456 } else if (in_regs[i].first()->is_stack()) {
1457 if (in_sig_bt[i] == T_ARRAY && map != NULL) {
1458 int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1459 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
1460 }
1461 }
1462 }
1463}
1464
1465// Pin object, return pinned object or null in rax
1466static void gen_pin_object(MacroAssembler* masm,
1467 VMRegPair reg) {
1468 __ block_comment("gen_pin_object {");
1469
1470 // rax always contains oop, either incoming or
1471 // pinned.
1472 Register tmp_reg = rax;
1473
1474 Label is_null;
1475 VMRegPair tmp;
1476 VMRegPair in_reg = reg;
1477
1478 tmp.set_ptr(tmp_reg->as_VMReg());
1479 if (reg.first()->is_stack()) {
1480 // Load the arg up from the stack
1481 move_ptr(masm, reg, tmp);
1482 reg = tmp;
1483 } else {
1484 __ movptr(rax, reg.first()->as_Register());
1485 }
1486 __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1487 __ jccb(Assembler::equal, is_null);
1488
1489 if (reg.first()->as_Register() != c_rarg1) {
1490 __ movptr(c_rarg1, reg.first()->as_Register());
1491 }
1492
1493 __ call_VM_leaf(
1494 CAST_FROM_FN_PTR(address, SharedRuntime::pin_object),
1495 r15_thread, c_rarg1);
1496
1497 __ bind(is_null);
1498 __ block_comment("} gen_pin_object");
1499}
1500
1501// Unpin object
1502static void gen_unpin_object(MacroAssembler* masm,
1503 VMRegPair reg) {
1504 __ block_comment("gen_unpin_object {");
1505 Label is_null;
1506
1507 if (reg.first()->is_stack()) {
1508 __ movptr(c_rarg1, Address(rbp, reg2offset_in(reg.first())));
1509 } else if (reg.first()->as_Register() != c_rarg1) {
1510 __ movptr(c_rarg1, reg.first()->as_Register());
1511 }
1512
1513 __ testptr(c_rarg1, c_rarg1);
1514 __ jccb(Assembler::equal, is_null);
1515
1516 __ call_VM_leaf(
1517 CAST_FROM_FN_PTR(address, SharedRuntime::unpin_object),
1518 r15_thread, c_rarg1);
1519
1520 __ bind(is_null);
1521 __ block_comment("} gen_unpin_object");
1522}
1523
1524// Check GCLocker::needs_gc and enter the runtime if it's true. This
1525// keeps a new JNI critical region from starting until a GC has been
1526// forced. Save down any oops in registers and describe them in an
1527// OopMap.
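//
// In pseudocode, the emitted check is roughly:
//   if (GCLocker::needs_gc()) {
//     spill the incoming arguments (recording oops in an OopMap);
//     SharedRuntime::block_for_jni_critical(thread);
//     reload the incoming arguments;
//   }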
1528static void check_needs_gc_for_critical_native(MacroAssembler* masm,
1529 int stack_slots,
1530 int total_c_args,
1531 int total_in_args,
1532 int arg_save_area,
1533 OopMapSet* oop_maps,
1534 VMRegPair* in_regs,
1535 BasicType* in_sig_bt) {
1536 __ block_comment("check GCLocker::needs_gc");
1537 Label cont;
1538 __ cmp8(ExternalAddress((address)GCLocker::needs_gc_address()), false);
1539 __ jcc(Assembler::equal, cont);
1540
1541 // Save down any incoming oops and call into the runtime to halt for a GC
1542
1543 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1544 save_or_restore_arguments(masm, stack_slots, total_in_args,
1545 arg_save_area, map, in_regs, in_sig_bt);
1546
1547 address the_pc = __ pc();
1548 oop_maps->add_gc_map( __ offset(), map);
1549 __ set_last_Java_frame(rsp, noreg, the_pc);
1550
1551 __ block_comment("block_for_jni_critical");
1552 __ movptr(c_rarg0, r15_thread);
1553 __ mov(r12, rsp); // remember sp
1554 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1555 __ andptr(rsp, -16); // align stack as required by ABI
1556 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical)));
1557 __ mov(rsp, r12); // restore sp
1558 __ reinit_heapbase();
1559
1560 __ reset_last_Java_frame(false);
1561
1562 save_or_restore_arguments(masm, stack_slots, total_in_args,
1563 arg_save_area, NULL, in_regs, in_sig_bt);
1564 __ bind(cont);
1565#ifdef ASSERT
1566 if (StressCriticalJNINatives) {
1567 // Stress register saving
1568 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1569 save_or_restore_arguments(masm, stack_slots, total_in_args,
1570 arg_save_area, map, in_regs, in_sig_bt);
1571 // Destroy argument registers
1572 for (int i = 0; i < total_in_args - 1; i++) {
1573 if (in_regs[i].first()->is_Register()) {
1574 const Register reg = in_regs[i].first()->as_Register();
1575 __ xorptr(reg, reg);
1576 } else if (in_regs[i].first()->is_XMMRegister()) {
1577 __ xorpd(in_regs[i].first()->as_XMMRegister(), in_regs[i].first()->as_XMMRegister());
1578 } else if (in_regs[i].first()->is_FloatRegister()) {
1579 ShouldNotReachHere();
1580 } else if (in_regs[i].first()->is_stack()) {
1581 // Nothing to do
1582 } else {
1583 ShouldNotReachHere();
1584 }
1585 if (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_DOUBLE) {
1586 i++;
1587 }
1588 }
1589
1590 save_or_restore_arguments(masm, stack_slots, total_in_args,
1591 arg_save_area, NULL, in_regs, in_sig_bt);
1592 }
1593#endif
1594}
1595
1596// Unpack an array argument into a pointer to the body and the length
1597// if the array is non-null, otherwise pass 0 for both.
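// (As set up by generate_native_wrapper below, each T_ARRAY in a critical
// native signature expands to an int followed by an address in the C
// signature, so e.g. a Java byte[] argument reaches the native entry point
// as a (jint length, jbyte* body) pair.)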
1598static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1599 Register tmp_reg = rax;
1600 assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1601 "possible collision");
1602 assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1603 "possible collision");
1604
1605 __ block_comment("unpack_array_argument {");
1606
1607 // Pass the length, ptr pair
1608 Label is_null, done;
1609 VMRegPair tmp;
1610 tmp.set_ptr(tmp_reg->as_VMReg());
1611 if (reg.first()->is_stack()) {
1612 // Load the arg up from the stack
1613 move_ptr(masm, reg, tmp);
1614 reg = tmp;
1615 }
1616 __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1617 __ jccb(Assembler::equal, is_null);
1618 __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1619 move_ptr(masm, tmp, body_arg);
1620 // load the length relative to the body.
1621 __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1622 arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1623 move32_64(masm, tmp, length_arg);
1624 __ jmpb(done);
1625 __ bind(is_null);
1626 // Pass zeros
1627 __ xorptr(tmp_reg, tmp_reg);
1628 move_ptr(masm, tmp, body_arg);
1629 move32_64(masm, tmp, length_arg);
1630 __ bind(done);
1631
1632 __ block_comment("} unpack_array_argument");
1633}
1634
1635
1636// Different signatures may require very different orders for the move
1637 // to avoid clobbering other arguments. There's no single ordering that
1638 // always works, so compute a safe order for issuing stores and
1639 // break any cycles in those stores. This code is fairly general but
1640// it's not necessary on the other platforms so we keep it in the
1641// platform dependent code instead of moving it into a shared file.
1642// (See bugs 7013347 & 7145024.)
1643// Note that this code is specific to LP64.
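// For example, if one argument has to move rdi -> rsi while another has to
// move rsi -> rdi, the two stores form a cycle; break_cycle() redirects one
// of them into the temporary register (rbx, see tmp_vmreg in
// generate_native_wrapper) and appends a final temp -> original-destination
// store at the end of the chain.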
1644class ComputeMoveOrder: public StackObj {
1645 class MoveOperation: public ResourceObj {
1646 friend class ComputeMoveOrder;
1647 private:
1648 VMRegPair _src;
1649 VMRegPair _dst;
1650 int _src_index;
1651 int _dst_index;
1652 bool _processed;
1653 MoveOperation* _next;
1654 MoveOperation* _prev;
1655
1656 static int get_id(VMRegPair r) {
1657 return r.first()->value();
1658 }
1659
1660 public:
1661 MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1662 _src(src)
1663 , _dst(dst)
1664 , _src_index(src_index)
1665 , _dst_index(dst_index)
1666 , _processed(false)
1667 , _next(NULL)
1668 , _prev(NULL) {
1669 }
1670
1671 VMRegPair src() const { return _src; }
1672 int src_id() const { return get_id(src()); }
1673 int src_index() const { return _src_index; }
1674 VMRegPair dst() const { return _dst; }
1675 void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1676 int dst_index() const { return _dst_index; }
1677 int dst_id() const { return get_id(dst()); }
1678 MoveOperation* next() const { return _next; }
1679 MoveOperation* prev() const { return _prev; }
1680 void set_processed() { _processed = true; }
1681 bool is_processed() const { return _processed; }
1682
1683 // insert
1684 void break_cycle(VMRegPair temp_register) {
1685 // create a new store following the last store
1686 // to move from the temp_register to the original
1687 MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1688
1689 // break the cycle of links and insert new_store at the end
1690 // break the reverse link.
1691 MoveOperation* p = prev();
1692 assert(p->next() == this, "must be");
1693 _prev = NULL;
1694 p->_next = new_store;
1695 new_store->_prev = p;
1696
1697 // change the original store to save its value in the temp.
1698 set_dst(-1, temp_register);
1699 }
1700
1701 void link(GrowableArray<MoveOperation*>& killer) {
1702 // link this store in front of the store that it depends on
1703 MoveOperation* n = killer.at_grow(src_id(), NULL);
1704 if (n != NULL) {
1705 assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1706 _next = n;
1707 n->_prev = this;
1708 }
1709 }
1710 };
1711
1712 private:
1713 GrowableArray<MoveOperation*> edges;
1714
1715 public:
1716 ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1717 BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1718 // Move operations where the dest is the stack can all be
1719 // scheduled first since they can't interfere with the other moves.
1720 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1721 if (in_sig_bt[i] == T_ARRAY) {
1722 c_arg--;
1723 if (out_regs[c_arg].first()->is_stack() &&
1724 out_regs[c_arg + 1].first()->is_stack()) {
1725 arg_order.push(i);
1726 arg_order.push(c_arg);
1727 } else {
1728 if (out_regs[c_arg].first()->is_stack() ||
1729 in_regs[i].first() == out_regs[c_arg].first()) {
1730 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1731 } else {
1732 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1733 }
1734 }
1735 } else if (in_sig_bt[i] == T_VOID) {
1736 arg_order.push(i);
1737 arg_order.push(c_arg);
1738 } else {
1739 if (out_regs[c_arg].first()->is_stack() ||
1740 in_regs[i].first() == out_regs[c_arg].first()) {
1741 arg_order.push(i);
1742 arg_order.push(c_arg);
1743 } else {
1744 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1745 }
1746 }
1747 }
1748 // Break any cycles in the register moves and emit them in the
1749 // proper order.
1750 GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1751 for (int i = 0; i < stores->length(); i++) {
1752 arg_order.push(stores->at(i)->src_index());
1753 arg_order.push(stores->at(i)->dst_index());
1754 }
1755 }
1756
1757 // Collect all the move operations
1758 void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1759 if (src.first() == dst.first()) return;
1760 edges.append(new MoveOperation(src_index, src, dst_index, dst));
1761 }
1762
1763 // Walk the edges breaking cycles between moves. The result list
1764 // can be walked in order to produce the proper set of loads
1765 GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1766 // Record which moves kill which values
1767 GrowableArray<MoveOperation*> killer;
1768 for (int i = 0; i < edges.length(); i++) {
1769 MoveOperation* s = edges.at(i);
1770 assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1771 killer.at_put_grow(s->dst_id(), s, NULL);
1772 }
1773 assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1774 "make sure temp isn't in the registers that are killed");
1775
1776 // create links between loads and stores
1777 for (int i = 0; i < edges.length(); i++) {
1778 edges.at(i)->link(killer);
1779 }
1780
1781 // at this point, all the move operations are chained together
1782 // in a doubly linked list. Processing it backwards finds
1783 // the beginning of the chain, forwards finds the end. If there's
1784 // a cycle it can be broken at any point, so pick an edge and walk
1785 // backward until the list ends or we end where we started.
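// (For instance, edges a->b, b->c and c->a chain into a loop: walking prev()
// from any of them eventually returns to the starting edge, which is what
// triggers break_cycle() on that edge.)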
1786 GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1787 for (int e = 0; e < edges.length(); e++) {
1788 MoveOperation* s = edges.at(e);
1789 if (!s->is_processed()) {
1790 MoveOperation* start = s;
1791 // search for the beginning of the chain or cycle
1792 while (start->prev() != NULL && start->prev() != s) {
1793 start = start->prev();
1794 }
1795 if (start->prev() == s) {
1796 start->break_cycle(temp_register);
1797 }
1798 // walk the chain forward inserting to store list
1799 while (start != NULL) {
1800 stores->append(start);
1801 start->set_processed();
1802 start = start->next();
1803 }
1804 }
1805 }
1806 return stores;
1807 }
1808};
1809
1810static void verify_oop_args(MacroAssembler* masm,
1811 const methodHandle& method,
1812 const BasicType* sig_bt,
1813 const VMRegPair* regs) {
1814 Register temp_reg = rbx; // not part of any compiled calling seq
1815 if (VerifyOops) {
1816 for (int i = 0; i < method->size_of_parameters(); i++) {
1817 if (sig_bt[i] == T_OBJECT ||
1818 sig_bt[i] == T_ARRAY) {
1819 VMReg r = regs[i].first();
1820 assert(r->is_valid(), "bad oop arg");
1821 if (r->is_stack()) {
1822 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1823 __ verify_oop(temp_reg);
1824 } else {
1825 __ verify_oop(r->as_Register());
1826 }
1827 }
1828 }
1829 }
1830}
1831
1832static void gen_special_dispatch(MacroAssembler* masm,
1833 const methodHandle& method,
1834 const BasicType* sig_bt,
1835 const VMRegPair* regs) {
1836 verify_oop_args(masm, method, sig_bt, regs);
1837 vmIntrinsics::ID iid = method->intrinsic_id();
1838
1839 // Now write the args into the outgoing interpreter space
1840 bool has_receiver = false;
1841 Register receiver_reg = noreg;
1842 int member_arg_pos = -1;
1843 Register member_reg = noreg;
1844 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1845 if (ref_kind != 0) {
1846 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1847 member_reg = rbx; // known to be free at this point
1848 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1849 } else if (iid == vmIntrinsics::_invokeBasic) {
1850 has_receiver = true;
1851 } else {
1852 fatal("unexpected intrinsic id %d", iid);
1853 }
1854
1855 if (member_reg != noreg) {
1856 // Load the member_arg into register, if necessary.
1857 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1858 VMReg r = regs[member_arg_pos].first();
1859 if (r->is_stack()) {
1860 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1861 } else {
1862 // no data motion is needed
1863 member_reg = r->as_Register();
1864 }
1865 }
1866
1867 if (has_receiver) {
1868 // Make sure the receiver is loaded into a register.
1869 assert(method->size_of_parameters() > 0, "oob");
1870 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1871 VMReg r = regs[0].first();
1872 assert(r->is_valid(), "bad receiver arg");
1873 if (r->is_stack()) {
1874 // Porting note: This assumes that compiled calling conventions always
1875 // pass the receiver oop in a register. If this is not true on some
1876 // platform, pick a temp and load the receiver from stack.
1877 fatal("receiver always in a register");
1878 receiver_reg = j_rarg0; // known to be free at this point
1879 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1880 } else {
1881 // no data motion is needed
1882 receiver_reg = r->as_Register();
1883 }
1884 }
1885
1886 // Figure out which address we are really jumping to:
1887 MethodHandles::generate_method_handle_dispatch(masm, iid,
1888 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1889}
1890
1891// ---------------------------------------------------------------------------
1892// Generate a native wrapper for a given method. The method takes arguments
1893// in the Java compiled code convention, marshals them to the native
1894// convention (handlizes oops, etc), transitions to native, makes the call,
1895// returns to java state (possibly blocking), unhandlizes any result and
1896// returns.
1897//
1898// Critical native functions are a shorthand for the use of
1899 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1900 // functions. The wrapper is expected to unpack the arguments before
1901 // passing them to the callee and perform checks before and after the
1902 // native call to ensure that the GCLocker
1903 // lock_critical/unlock_critical semantics are followed. Some other
1904 // parts of JNI setup are skipped, like the tear-down of the JNI handle
1905 // block and the check for pending exceptions, since it is impossible for
1906 // them to be thrown.
1907//
1908// They are roughly structured like this:
1909// if (GCLocker::needs_gc())
1910// SharedRuntime::block_for_jni_critical();
1911 // transition to thread_in_native
1912 // unpack array arguments and call native entry point
1913 // check for safepoint in progress
1914 // check if any thread suspend flags are set
1915 // call into JVM and possibly unlock the JNI critical region
1916 // if a GC was suppressed while in the critical native.
1917// transition back to thread_in_Java
1918// return to caller
1919//
1920nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1921 const methodHandle& method,
1922 int compile_id,
1923 BasicType* in_sig_bt,
1924 VMRegPair* in_regs,
1925 BasicType ret_type) {
1926 if (method->is_method_handle_intrinsic()) {
1927 vmIntrinsics::ID iid = method->intrinsic_id();
1928 intptr_t start = (intptr_t)__ pc();
1929 int vep_offset = ((intptr_t)__ pc()) - start;
1930 gen_special_dispatch(masm,
1931 method,
1932 in_sig_bt,
1933 in_regs);
1934 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1935 __ flush();
1936 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1937 return nmethod::new_native_nmethod(method,
1938 compile_id,
1939 masm->code(),
1940 vep_offset,
1941 frame_complete,
1942 stack_slots / VMRegImpl::slots_per_word,
1943 in_ByteSize(-1),
1944 in_ByteSize(-1),
1945 (OopMapSet*)NULL);
1946 }
1947 bool is_critical_native = true;
1948 address native_func = method->critical_native_function();
1949 if (native_func == NULL) {
1950 native_func = method->native_function();
1951 is_critical_native = false;
1952 }
1953 assert(native_func != NULL, "must have function");
1954
1955 // An OopMap for lock (and class if static)
1956 OopMapSet *oop_maps = new OopMapSet();
1957 intptr_t start = (intptr_t)__ pc();
1958
1959 // We have received a description of where all the java args are located
1960 // on entry to the wrapper. We need to convert these args to where
1961 // the jni function will expect them. To figure out where they go
1962 // we convert the java signature to a C signature by inserting
1963 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1964
1965 const int total_in_args = method->size_of_parameters();
1966 int total_c_args = total_in_args;
1967 if (!is_critical_native) {
1968 total_c_args += 1;
1969 if (method->is_static()) {
1970 total_c_args++;
1971 }
1972 } else {
1973 for (int i = 0; i < total_in_args; i++) {
1974 if (in_sig_bt[i] == T_ARRAY) {
1975 total_c_args++;
1976 }
1977 }
1978 }
1979
1980 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1981 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1982 BasicType* in_elem_bt = NULL;
1983
1984 int argc = 0;
1985 if (!is_critical_native) {
1986 out_sig_bt[argc++] = T_ADDRESS;
1987 if (method->is_static()) {
1988 out_sig_bt[argc++] = T_OBJECT;
1989 }
1990
1991 for (int i = 0; i < total_in_args ; i++ ) {
1992 out_sig_bt[argc++] = in_sig_bt[i];
1993 }
1994 } else {
1995 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1996 SignatureStream ss(method->signature());
1997 for (int i = 0; i < total_in_args ; i++ ) {
1998 if (in_sig_bt[i] == T_ARRAY) {
1999 // Arrays are passed as int, elem* pair
2000 out_sig_bt[argc++] = T_INT;
2001 out_sig_bt[argc++] = T_ADDRESS;
2002 Symbol* atype = ss.as_symbol();
2003 const char* at = atype->as_C_string();
2004 if (strlen(at) == 2) {
2005 assert(at[0] == '[', "must be");
2006 switch (at[1]) {
2007 case 'B': in_elem_bt[i] = T_BYTE; break;
2008 case 'C': in_elem_bt[i] = T_CHAR; break;
2009 case 'D': in_elem_bt[i] = T_DOUBLE; break;
2010 case 'F': in_elem_bt[i] = T_FLOAT; break;
2011 case 'I': in_elem_bt[i] = T_INT; break;
2012 case 'J': in_elem_bt[i] = T_LONG; break;
2013 case 'S': in_elem_bt[i] = T_SHORT; break;
2014 case 'Z': in_elem_bt[i] = T_BOOLEAN; break;
2015 default: ShouldNotReachHere();
2016 }
2017 }
2018 } else {
2019 out_sig_bt[argc++] = in_sig_bt[i];
2020 in_elem_bt[i] = T_VOID;
2021 }
2022 if (in_sig_bt[i] != T_VOID) {
2023 assert(in_sig_bt[i] == ss.type(), "must match");
2024 ss.next();
2025 }
2026 }
2027 }
2028
2029 // Now figure out where the args must be stored and how much stack space
2030 // they require.
2031 int out_arg_slots;
2032 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
2033
2034 // Compute framesize for the wrapper. We need to handlize all oops in
2035 // incoming registers
2036
2037 // Calculate the total number of stack slots we will need.
2038
2039 // First count the abi requirement plus all of the outgoing args
2040 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2041
2042 // Now the space for the inbound oop handle area
2043 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
2044 if (is_critical_native) {
2045 // Critical natives may have to call out so they need a save area
2046 // for register arguments.
2047 int double_slots = 0;
2048 int single_slots = 0;
2049 for ( int i = 0; i < total_in_args; i++) {
2050 if (in_regs[i].first()->is_Register()) {
2051 const Register reg = in_regs[i].first()->as_Register();
2052 switch (in_sig_bt[i]) {
2053 case T_BOOLEAN:
2054 case T_BYTE:
2055 case T_SHORT:
2056 case T_CHAR:
2057 case T_INT: single_slots++; break;
2058 case T_ARRAY: // specific to LP64 (7145024)
2059 case T_LONG: double_slots++; break;
2060 default: ShouldNotReachHere();
2061 }
2062 } else if (in_regs[i].first()->is_XMMRegister()) {
2063 switch (in_sig_bt[i]) {
2064 case T_FLOAT: single_slots++; break;
2065 case T_DOUBLE: double_slots++; break;
2066 default: ShouldNotReachHere();
2067 }
2068 } else if (in_regs[i].first()->is_FloatRegister()) {
2069 ShouldNotReachHere();
2070 }
2071 }
2072 total_save_slots = double_slots * 2 + single_slots;
2073 // align the save area
2074 if (double_slots != 0) {
2075 stack_slots = align_up(stack_slots, 2);
2076 }
2077 }
2078
2079 int oop_handle_offset = stack_slots;
2080 stack_slots += total_save_slots;
2081
2082 // Now any space we need for handlizing a klass if static method
2083
2084 int klass_slot_offset = 0;
2085 int klass_offset = -1;
2086 int lock_slot_offset = 0;
2087 bool is_static = false;
2088
2089 if (method->is_static()) {
2090 klass_slot_offset = stack_slots;
2091 stack_slots += VMRegImpl::slots_per_word;
2092 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2093 is_static = true;
2094 }
2095
2096 // Plus a lock if needed
2097
2098 if (method->is_synchronized()) {
2099 lock_slot_offset = stack_slots;
2100 stack_slots += VMRegImpl::slots_per_word;
2101 }
2102
2103 // Now a place (+2) to save return values or temp during shuffling
2104 // + 4 for return address (which we own) and saved rbp
2105 stack_slots += 6;
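// (Compiler stack slots are 32-bit, so this is 2 slots for the temp/return
// value area plus 2 each for the return address and the saved rbp.)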
2106
2107 // Ok The space we have allocated will look like:
2108 //
2109 //
2110 // FP-> | |
2111 // |---------------------|
2112 // | 2 slots for moves |
2113 // |---------------------|
2114 // | lock box (if sync) |
2115 // |---------------------| <- lock_slot_offset
2116 // | klass (if static) |
2117 // |---------------------| <- klass_slot_offset
2118 // | oopHandle area |
2119 // |---------------------| <- oop_handle_offset (6 java arg registers)
2120 // | outbound memory |
2121 // | based arguments |
2122 // | |
2123 // |---------------------|
2124 // | |
2125 // SP-> | out_preserved_slots |
2126 //
2127 //
2128
2129
2130 // Now compute actual number of stack words we need rounding to make
2131 // stack properly aligned.
2132 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2133
2134 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2135
2136 // First thing make an ic check to see if we should even be here
2137
2138 // We are free to use all registers as temps without saving them and
2139 // restoring them except rbp. rbp is the only callee save register
2140 // as far as the interpreter and the compiler(s) are concerned.
2141
2142
2143 const Register ic_reg = rax;
2144 const Register receiver = j_rarg0;
2145
2146 Label hit;
2147 Label exception_pending;
2148
2149 assert_different_registers(ic_reg, receiver, rscratch1);
2150 __ verify_oop(receiver);
2151 __ load_klass(rscratch1, receiver);
2152 __ cmpq(ic_reg, rscratch1);
2153 __ jcc(Assembler::equal, hit);
2154
2155 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2156
2157 // Verified entry point must be aligned
2158 __ align(8);
2159
2160 __ bind(hit);
2161
2162 int vep_offset = ((intptr_t)__ pc()) - start;
2163
2164 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2165 Label L_skip_barrier;
2166 Register klass = r10;
2167 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2168 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2169
2170 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2171
2172 __ bind(L_skip_barrier);
2173 }
2174
2175#ifdef COMPILER1
2176 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2177 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2178 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2179 }
2180#endif // COMPILER1
2181
2182 // The instruction at the verified entry point must be 5 bytes or longer
2183 // because it can be patched on the fly by make_non_entrant. The stack bang
2184 // instruction fits that requirement.
2185
2186 // Generate stack overflow check
2187
2188 if (UseStackBanging) {
2189 __ bang_stack_with_offset((int)JavaThread::stack_shadow_zone_size());
2190 } else {
2191 // need a 5 byte instruction to allow MT safe patching to non-entrant
2192 __ fat_nop();
2193 }
2194
2195 // Generate a new frame for the wrapper.
2196 __ enter();
2197 // -2 because return address is already present and so is saved rbp
2198 __ subptr(rsp, stack_size - 2*wordSize);
2199
2200 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2201 bs->nmethod_entry_barrier(masm);
2202
2203 // Frame is now completed as far as size and linkage.
2204 int frame_complete = ((intptr_t)__ pc()) - start;
2205
2206 if (UseRTMLocking) {
2207 // Abort RTM transaction before calling JNI
2208 // because critical section will be large and will be
2209 // aborted anyway. Also nmethod could be deoptimized.
2210 __ xabort(0);
2211 }
2212
2213#ifdef ASSERT
2214 {
2215 Label L;
2216 __ mov(rax, rsp);
2217 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
2218 __ cmpptr(rax, rsp);
2219 __ jcc(Assembler::equal, L);
2220 __ stop("improperly aligned stack");
2221 __ bind(L);
2222 }
2223#endif /* ASSERT */
2224
2225
2226 // We use r14 as the oop handle for the receiver/klass
2227 // It is callee save so it survives the call to native
2228
2229 const Register oop_handle_reg = r14;
2230
2231 if (is_critical_native && !Universe::heap()->supports_object_pinning()) {
2232 check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
2233 oop_handle_offset, oop_maps, in_regs, in_sig_bt);
2234 }
2235
2236 //
2237 // We immediately shuffle the arguments so that any vm call we have to
2238 // make from here on out (sync slow path, jvmti, etc.) we will have
2239 // captured the oops from our caller and have a valid oopMap for
2240 // them.
2241
2242 // -----------------
2243 // The Grand Shuffle
2244
2245 // The Java calling convention is either equal (linux) or denser (win64) than the
2246 // C calling convention. However, because of the jni_env argument, the C calling
2247 // convention always has at least one more (and two for static) arguments than Java.
2248 // Therefore if we move the args from java -> c backwards then we will never have
2249 // a register->register conflict and we don't have to build a dependency graph
2250 // and figure out how to break any cycles.
2251 //
2252
2253 // Record esp-based slot for receiver on stack for non-static methods
2254 int receiver_offset = -1;
2255
2256 // This is a trick. We double the stack slots so we can claim
2257 // the oops in the caller's frame. Since we are sure to have
2258 // more args than the caller doubling is enough to make
2259 // sure we can capture all the incoming oop args from the
2260 // caller.
2261 //
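// (For example, save_or_restore_arguments above describes an incoming stack
// oop at offset_in_older_frame + stack_slots, which only lands inside the
// map because the map covers 2 * stack_slots.)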
2262 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2263
2264 // Mark location of rbp (someday)
2265 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2266
2267 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2268 // All inbound args are referenced based on rbp and all outbound args via rsp.
2269
2270
2271#ifdef ASSERT
2272 bool reg_destroyed[RegisterImpl::number_of_registers];
2273 bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2274 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2275 reg_destroyed[r] = false;
2276 }
2277 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2278 freg_destroyed[f] = false;
2279 }
2280
2281#endif /* ASSERT */
2282
2283 // This may iterate in two different directions depending on the
2284 // kind of native it is. The reason is that for regular JNI natives
2285 // the incoming and outgoing registers are offset upwards and for
2286 // critical natives they are offset down.
2287 GrowableArray<int> arg_order(2 * total_in_args);
2288 // Inbound arguments that need to be pinned for critical natives
2289 GrowableArray<int> pinned_args(total_in_args);
2290 // Current stack slot for storing register based array argument
2291 int pinned_slot = oop_handle_offset;
2292
2293 VMRegPair tmp_vmreg;
2294 tmp_vmreg.set2(rbx->as_VMReg());
2295
2296 if (!is_critical_native) {
2297 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2298 arg_order.push(i);
2299 arg_order.push(c_arg);
2300 }
2301 } else {
2302 // Compute a valid move order, using tmp_vmreg to break any cycles
2303 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2304 }
2305
2306 int temploc = -1;
2307 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2308 int i = arg_order.at(ai);
2309 int c_arg = arg_order.at(ai + 1);
2310 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2311 if (c_arg == -1) {
2312 assert(is_critical_native, "should only be required for critical natives");
2313 // This arg needs to be moved to a temporary
2314 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2315 in_regs[i] = tmp_vmreg;
2316 temploc = i;
2317 continue;
2318 } else if (i == -1) {
2319 assert(is_critical_native, "should only be required for critical natives");
2320 // Read from the temporary location
2321 assert(temploc != -1, "must be valid");
2322 i = temploc;
2323 temploc = -1;
2324 }
2325#ifdef ASSERT
2326 if (in_regs[i].first()->is_Register()) {
2327 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2328 } else if (in_regs[i].first()->is_XMMRegister()) {
2329 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2330 }
2331 if (out_regs[c_arg].first()->is_Register()) {
2332 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2333 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2334 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2335 }
2336#endif /* ASSERT */
2337 switch (in_sig_bt[i]) {
2338 case T_ARRAY:
2339 if (is_critical_native) {
2340 // pin before unpack
2341 if (Universe::heap()->supports_object_pinning()) {
2342 save_args(masm, total_c_args, 0, out_regs);
2343 gen_pin_object(masm, in_regs[i]);
2344 pinned_args.append(i);
2345 restore_args(masm, total_c_args, 0, out_regs);
2346
2347 // rax has pinned array
2348 VMRegPair result_reg;
2349 result_reg.set_ptr(rax->as_VMReg());
2350 move_ptr(masm, result_reg, in_regs[i]);
2351 if (!in_regs[i].first()->is_stack()) {
2352 assert(pinned_slot <= stack_slots, "overflow");
2353 move_ptr(masm, result_reg, VMRegImpl::stack2reg(pinned_slot));
2354 pinned_slot += VMRegImpl::slots_per_word;
2355 }
2356 }
2357 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2358 c_arg++;
2359#ifdef ASSERT
2360 if (out_regs[c_arg].first()->is_Register()) {
2361 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2362 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2363 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2364 }
2365#endif
2366 break;
2367 }
2368 case T_OBJECT:
2369 assert(!is_critical_native, "no oop arguments");
2370 object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2371 ((i == 0) && (!is_static)),
2372 &receiver_offset);
2373 break;
2374 case T_VOID:
2375 break;
2376
2377 case T_FLOAT:
2378 float_move(masm, in_regs[i], out_regs[c_arg]);
2379 break;
2380
2381 case T_DOUBLE:
2382 assert( i + 1 < total_in_args &&
2383 in_sig_bt[i + 1] == T_VOID &&
2384 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2385 double_move(masm, in_regs[i], out_regs[c_arg]);
2386 break;
2387
2388 case T_LONG :
2389 long_move(masm, in_regs[i], out_regs[c_arg]);
2390 break;
2391
2392 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2393
2394 default:
2395 move32_64(masm, in_regs[i], out_regs[c_arg]);
2396 }
2397 }
2398
2399 int c_arg;
2400
2401 // Pre-load a static method's oop into r14. Used both by locking code and
2402 // the normal JNI call code.
2403 if (!is_critical_native) {
2404 // point c_arg at the first arg that is already loaded in case we
2405 // need to spill before we call out
2406 c_arg = total_c_args - total_in_args;
2407
2408 if (method->is_static()) {
2409
2410 // load oop into a register
2411 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2412
2413 // Now handlize the static class mirror; it's known to be not-null.
2414 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2415 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2416
2417 // Now get the handle
2418 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2419 // store the klass handle as second argument
2420 __ movptr(c_rarg1, oop_handle_reg);
2421 // and protect the arg if we must spill
2422 c_arg--;
2423 }
2424 } else {
2425 // For JNI critical methods we need to save all registers in save_args.
2426 c_arg = 0;
2427 }
2428
2429 // Change state to native (we save the return address in the thread, since it might not
2430 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2431 // points into the right code segment. It does not have to be the correct return pc.
2432 // We use the same pc/oopMap repeatedly when we call out
2433
2434 intptr_t the_pc = (intptr_t) __ pc();
2435 oop_maps->add_gc_map(the_pc - start, map);
2436
2437 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2438
2439
2440 // We have all of the arguments set up at this point. We must not touch any of the
2441 // argument registers from here on, since no oopMap describes them if we save/restore them.
2442
2443 {
2444 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2445 // protect the args we've loaded
2446 save_args(masm, total_c_args, c_arg, out_regs);
2447 __ mov_metadata(c_rarg1, method());
2448 __ call_VM_leaf(
2449 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2450 r15_thread, c_rarg1);
2451 restore_args(masm, total_c_args, c_arg, out_regs);
2452 }
2453
2454 // RedefineClasses() tracing support for obsolete method entry
2455 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2456 // protect the args we've loaded
2457 save_args(masm, total_c_args, c_arg, out_regs);
2458 __ mov_metadata(c_rarg1, method());
2459 __ call_VM_leaf(
2460 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2461 r15_thread, c_rarg1);
2462 restore_args(masm, total_c_args, c_arg, out_regs);
2463 }
2464
2465 // Lock a synchronized method
2466
2467 // Register definitions used by locking and unlocking
2468
2469 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2470 const Register obj_reg = rbx; // Will contain the oop
2471 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2472 const Register old_hdr = r13; // value of old header at unlock time
2473
2474 Label slow_path_lock;
2475 Label lock_done;
2476
2477 if (method->is_synchronized()) {
2478 assert(!is_critical_native, "unhandled");
2479
2480
2481 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2482
2483 // Get the handle (the 2nd argument)
2484 __ mov(oop_handle_reg, c_rarg1);
2485
2486 // Get address of the box
2487
2488 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2489
2490 // Load the oop from the handle
2491 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2492
2493 __ resolve(IS_NOT_NULL, obj_reg);
2494 if (UseBiasedLocking) {
2495 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
2496 }
2497
2498 // Load immediate 1 into swap_reg %rax
2499 __ movl(swap_reg, 1);
2500
2501 // Load (object->mark() | 1) into swap_reg %rax
2502 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2503
2504 // Save (object->mark() | 1) into BasicLock's displaced header
2505 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2506
2507 // src -> dest iff dest == rax else rax <- dest
2508 __ lock();
2509 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2510 __ jcc(Assembler::equal, lock_done);
2511
2512 // Hmm should this move to the slow path code area???
2513
2514 // Test if the oopMark is an obvious stack pointer, i.e.,
2515 // 1) (mark & 3) == 0, and
2516 // 2) rsp <= mark < rsp + os::pagesize()
2517 // These 3 tests can be done by evaluating the following
2518 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2519 // assuming both stack pointer and pagesize have their
2520 // least significant 2 bits clear.
2521 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
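// For example, assuming a 4K page: 3 - os::vm_page_size() == ...fffff003, so
// the and-result is zero exactly when (mark - rsp) has its low two bits clear
// and no bits at or above the page size set, i.e. (mark & 3) == 0 and
// rsp <= mark < rsp + os::vm_page_size().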
2522
2523 __ subptr(swap_reg, rsp);
2524 __ andptr(swap_reg, 3 - os::vm_page_size());
2525
2526 // Save the test result, for recursive case, the result is zero
2527 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2528 __ jcc(Assembler::notEqual, slow_path_lock);
2529
2530 // Slow path will re-enter here
2531
2532 __ bind(lock_done);
2533 }
2534
2535
2536 // Finally just about ready to make the JNI call
2537
2538
2539 // get JNIEnv* which is first argument to native
2540 if (!is_critical_native) {
2541 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2542 }
2543
2544 // Now set thread in native
2545 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2546
2547 __ call(RuntimeAddress(native_func));
2548
2549 // Verify or restore cpu control state after JNI call
2550 __ restore_cpu_control_state_after_jni();
2551
2552 // Unpack native results.
2553 switch (ret_type) {
2554 case T_BOOLEAN: __ c2bool(rax); break;
2555 case T_CHAR : __ movzwl(rax, rax); break;
2556 case T_BYTE : __ sign_extend_byte (rax); break;
2557 case T_SHORT : __ sign_extend_short(rax); break;
2558 case T_INT : /* nothing to do */ break;
2559 case T_DOUBLE :
2560 case T_FLOAT :
2561 // Result is in xmm0 we'll save as needed
2562 break;
2563 case T_ARRAY: // Really a handle
2564 case T_OBJECT: // Really a handle
2565 break; // can't de-handlize until after safepoint check
2566 case T_VOID: break;
2567 case T_LONG: break;
2568 default : ShouldNotReachHere();
2569 }
2570
2571 // unpin pinned arguments
2572 pinned_slot = oop_handle_offset;
2573 if (pinned_args.length() > 0) {
2574 // save return value that may be overwritten otherwise.
2575 save_native_result(masm, ret_type, stack_slots);
2576 for (int index = 0; index < pinned_args.length(); index ++) {
2577 int i = pinned_args.at(index);
2578 assert(pinned_slot <= stack_slots, "overflow");
2579 if (!in_regs[i].first()->is_stack()) {
2580 int offset = pinned_slot * VMRegImpl::stack_slot_size;
2581 __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
2582 pinned_slot += VMRegImpl::slots_per_word;
2583 }
2584 gen_unpin_object(masm, in_regs[i]);
2585 }
2586 restore_native_result(masm, ret_type, stack_slots);
2587 }
2588
2589 // Switch thread to "native transition" state before reading the synchronization state.
2590 // This additional state is necessary because reading and testing the synchronization
2591 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2592 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2593 // VM thread changes sync state to synchronizing and suspends threads for GC.
2594 // Thread A is resumed to finish this native method, but doesn't block here since it
2595 // didn't see any synchronization in progress, and escapes.
2596 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2597
2598 // Force this write out before the read below
2599 __ membar(Assembler::Membar_mask_bits(
2600 Assembler::LoadLoad | Assembler::LoadStore |
2601 Assembler::StoreLoad | Assembler::StoreStore));
2602
2603 Label after_transition;
2604
2605 // check for safepoint operation in progress and/or pending suspend requests
2606 {
2607 Label Continue;
2608 Label slow_path;
2609
2610 __ safepoint_poll(slow_path, r15_thread, rscratch1);
2611
2612 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2613 __ jcc(Assembler::equal, Continue);
2614 __ bind(slow_path);
2615
2616 // Don't use call_VM as it will see a possible pending exception and forward it
2617 // and never return here preventing us from clearing _last_native_pc down below.
2618 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2619 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2620 // by hand.
2621 //
2622 __ vzeroupper();
2623 save_native_result(masm, ret_type, stack_slots);
2624 __ mov(c_rarg0, r15_thread);
2625 __ mov(r12, rsp); // remember sp
2626 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2627 __ andptr(rsp, -16); // align stack as required by ABI
2628 if (!is_critical_native) {
2629 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2630 } else {
2631 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
2632 }
2633 __ mov(rsp, r12); // restore sp
2634 __ reinit_heapbase();
2635 // Restore any method result value
2636 restore_native_result(masm, ret_type, stack_slots);
2637
2638 if (is_critical_native) {
2639 // The call above performed the transition to thread_in_Java so
2640 // skip the transition logic below.
2641 __ jmpb(after_transition);
2642 }
2643
2644 __ bind(Continue);
2645 }
2646
2647 // change thread state
2648 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2649 __ bind(after_transition);
2650
2651 Label reguard;
2652 Label reguard_done;
2653 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_reserved_disabled);
2654 __ jcc(Assembler::equal, reguard);
2655 __ bind(reguard_done);
2656
2657 // native result if any is live
2658
2659 // Unlock
2660 Label unlock_done;
2661 Label slow_path_unlock;
2662 if (method->is_synchronized()) {
2663
2664 // Get locked oop from the handle we passed to jni
2665 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2666 __ resolve(IS_NOT_NULL, obj_reg);
2667
2668 Label done;
2669
2670 if (UseBiasedLocking) {
2671 __ biased_locking_exit(obj_reg, old_hdr, done);
2672 }
2673
2674 // Simple recursive lock?
2675
2676 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2677 __ jcc(Assembler::equal, done);
2678
2679 // Must save rax if it is live now because cmpxchg must use it
2680 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2681 save_native_result(masm, ret_type, stack_slots);
2682 }
2683
2684
2685 // get address of the stack lock
2686 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2687 // get old displaced header
2688 __ movptr(old_hdr, Address(rax, 0));
2689
2690 // Atomic swap old header if oop still contains the stack lock
2691 __ lock();
2692 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2693 __ jcc(Assembler::notEqual, slow_path_unlock);
2694
2695 // slow path re-enters here
2696 __ bind(unlock_done);
2697 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2698 restore_native_result(masm, ret_type, stack_slots);
2699 }
2700
2701 __ bind(done);
2702
2703 }
2704 {
2705 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2706 save_native_result(masm, ret_type, stack_slots);
2707 __ mov_metadata(c_rarg1, method());
2708 __ call_VM_leaf(
2709 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2710 r15_thread, c_rarg1);
2711 restore_native_result(masm, ret_type, stack_slots);
2712 }
2713
2714 __ reset_last_Java_frame(false);
2715
2716 // Unbox oop result, e.g. JNIHandles::resolve value.
2717 if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
2718 __ resolve_jobject(rax /* value */,
2719 r15_thread /* thread */,
2720 rcx /* tmp */);
2721 }
2722
2723 if (CheckJNICalls) {
2724 // clear_pending_jni_exception_check
2725 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2726 }
2727
2728 if (!is_critical_native) {
2729 // reset handle block
2730 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2731 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2732 }
2733
2734 // pop our frame
2735
2736 __ leave();
2737
2738 if (!is_critical_native) {
2739 // Any exception pending?
2740 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2741 __ jcc(Assembler::notEqual, exception_pending);
2742 }
2743
2744 // Return
2745
2746 __ ret(0);
2747
2748 // Unexpected paths are out of line and go here
2749
2750 if (!is_critical_native) {
2751 // forward the exception
2752 __ bind(exception_pending);
2753
2754 // and forward the exception
2755 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2756 }
2757
2758 // Slow path locking & unlocking
2759 if (method->is_synchronized()) {
2760
2761 // BEGIN Slow path lock
2762 __ bind(slow_path_lock);
2763
2764 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2765 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2766
2767 // protect the args we've loaded
2768 save_args(masm, total_c_args, c_arg, out_regs);
2769
2770 __ mov(c_rarg0, obj_reg);
2771 __ mov(c_rarg1, lock_reg);
2772 __ mov(c_rarg2, r15_thread);
2773
2774 // Not a leaf but we have last_Java_frame setup as we want
2775 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2776 restore_args(masm, total_c_args, c_arg, out_regs);
2777
2778#ifdef ASSERT
2779 { Label L;
2780 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2781 __ jcc(Assembler::equal, L);
2782 __ stop("no pending exception allowed on exit from monitorenter");
2783 __ bind(L);
2784 }
2785#endif
2786 __ jmp(lock_done);
2787
2788 // END Slow path lock
2789
2790 // BEGIN Slow path unlock
2791 __ bind(slow_path_unlock);
2792
2793 // If we haven't already saved the native result we must save it now as xmm registers
2794 // are still exposed.
2795 __ vzeroupper();
2796 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2797 save_native_result(masm, ret_type, stack_slots);
2798 }
2799
2800 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2801
2802 __ mov(c_rarg0, obj_reg);
2803 __ mov(c_rarg2, r15_thread);
2804 __ mov(r12, rsp); // remember sp
2805 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2806 __ andptr(rsp, -16); // align stack as required by ABI
2807
2808 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2809 // NOTE that obj_reg == rbx currently
2810 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2811 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2812
2813 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2814 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2815 __ mov(rsp, r12); // restore sp
2816 __ reinit_heapbase();
2817#ifdef ASSERT
2818 {
2819 Label L;
2820 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2821 __ jcc(Assembler::equal, L);
2822 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2823 __ bind(L);
2824 }
2825#endif /* ASSERT */
2826
2827 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2828
2829 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2830 restore_native_result(masm, ret_type, stack_slots);
2831 }
2832 __ jmp(unlock_done);
2833
2834 // END Slow path unlock
2835
2836 } // synchronized
2837
2838 // SLOW PATH Reguard the stack if needed
2839
2840 __ bind(reguard);
2841 __ vzeroupper();
2842 save_native_result(masm, ret_type, stack_slots);
2843 __ mov(r12, rsp); // remember sp
2844 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2845 __ andptr(rsp, -16); // align stack as required by ABI
2846 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2847 __ mov(rsp, r12); // restore sp
2848 __ reinit_heapbase();
2849 restore_native_result(masm, ret_type, stack_slots);
2850 // and continue
2851 __ jmp(reguard_done);
2852
2853
2854
2855 __ flush();
2856
2857 nmethod *nm = nmethod::new_native_nmethod(method,
2858 compile_id,
2859 masm->code(),
2860 vep_offset,
2861 frame_complete,
2862 stack_slots / VMRegImpl::slots_per_word,
2863 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2864 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2865 oop_maps);
2866
2867 if (is_critical_native) {
2868 nm->set_lazy_critical_native(true);
2869 }
2870
2871 return nm;
2872
2873}
2874
2875 // this function returns the adjusted size (in number of words) to a c2i adapter
2876// activation for use during deoptimization
2877int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2878 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2879}
2880
2881
2882uint SharedRuntime::out_preserve_stack_slots() {
2883 return 0;
2884}
2885
2886//------------------------------generate_deopt_blob----------------------------
2887void SharedRuntime::generate_deopt_blob() {
2888 // Allocate space for the code
2889 ResourceMark rm;
2890 // Setup code generation tools
2891 int pad = 0;
2892#if INCLUDE_JVMCI
2893 if (EnableJVMCI || UseAOT) {
2894 pad += 512; // Increase the buffer size when compiling for JVMCI
2895 }
2896#endif
2897 CodeBuffer buffer("deopt_blob", 2048+pad, 1024);
2898 MacroAssembler* masm = new MacroAssembler(&buffer);
2899 int frame_size_in_words;
2900 OopMap* map = NULL;
2901 OopMapSet *oop_maps = new OopMapSet();
2902
2903 // -------------
2904 // This code enters when returning to a de-optimized nmethod. A return
2905 // address has been pushed on the stack, and return values are in
2906 // registers.
2907 // If we are doing a normal deopt then we were called from the patched
2908 // nmethod from the point we returned to the nmethod. So the return
2909 // address on the stack is wrong by NativeCall::instruction_size
2910 // We will adjust the value so it looks like we have the original return
2911 // address on the stack (like when we eagerly deoptimized).
2912 // In the case of an exception pending when deoptimizing, we enter
2913 // with a return address on the stack that points after the call we patched
2914 // into the exception handler. We have the following register state from,
2915 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2916 // rax: exception oop
2917 // rbx: exception handler
2918 // rdx: throwing pc
2919 // So in this case we simply jam rdx into the useless return address and
2920 // the stack looks just like we want.
2921 //
2922 // At this point we need to de-opt. We save the argument return
2923 // registers. We call the first C routine, fetch_unroll_info(). This
2924 // routine captures the return values and returns a structure which
2925 // describes the current frame size and the sizes of all replacement frames.
2926 // The current frame is compiled code and may contain many inlined
2927 // functions, each with their own JVM state. We pop the current frame, then
2928 // push all the new frames. Then we call the C routine unpack_frames() to
2929 // populate these frames. Finally unpack_frames() returns us the new target
2930 // address. Notice that callee-save registers are BLOWN here; they have
2931 // already been captured in the vframeArray at the time the return PC was
2932 // patched.
2933 address start = __ pc();
2934 Label cont;
2935
2936 // Prolog for non exception case!
2937
2938 // Save everything in sight.
2939 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2940
2941 // Normal deoptimization. Save exec mode for unpack_frames.
2942 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2943 __ jmp(cont);
2944
2945 int reexecute_offset = __ pc() - start;
2946#if INCLUDE_JVMCI && !defined(COMPILER1)
2947 if (EnableJVMCI && UseJVMCICompiler) {
2948 // JVMCI does not use this kind of deoptimization
2949 __ should_not_reach_here();
2950 }
2951#endif
2952
2953 // Reexecute case
2954 // return address is the pc that describes what bci to re-execute at
2955
2956 // No need to update map as each call to save_live_registers will produce identical oopmap
2957 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2958
2959 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2960 __ jmp(cont);
2961
2962#if INCLUDE_JVMCI
2963 Label after_fetch_unroll_info_call;
2964 int implicit_exception_uncommon_trap_offset = 0;
2965 int uncommon_trap_offset = 0;
2966
2967 if (EnableJVMCI || UseAOT) {
2968 implicit_exception_uncommon_trap_offset = __ pc() - start;
2969
2970 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2971 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2972
2973 uncommon_trap_offset = __ pc() - start;
2974
2975 // Save everything in sight.
2976 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
2977 // fetch_unroll_info needs to call last_java_frame()
2978 __ set_last_Java_frame(noreg, noreg, NULL);
2979
2980 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2981 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2982
2983 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2984 __ mov(c_rarg0, r15_thread);
2985 __ movl(c_rarg2, r14); // exec mode
2986 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2987 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2988
2989 __ reset_last_Java_frame(false);
2990
2991 __ jmp(after_fetch_unroll_info_call);
2992 } // EnableJVMCI
2993#endif // INCLUDE_JVMCI
2994
2995 int exception_offset = __ pc() - start;
2996
2997 // Prolog for exception case
2998
2999 // all registers are dead at this entry point, except for rax, and
3000 // rdx which contain the exception oop and exception pc
3001 // respectively. Set them in TLS and fall thru to the
3002 // unpack_with_exception_in_tls entry point.
3003
3004 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3005 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3006
3007 int exception_in_tls_offset = __ pc() - start;
3008
3009 // new implementation because exception oop is now passed in JavaThread
3010
3011 // Prolog for exception case
3012 // All registers must be preserved because they might be used by LinearScan
3013 // Exception oop and throwing PC are passed in JavaThread
3014 // tos: stack at point of call to method that threw the exception (i.e. only
3015 // args are on the stack, no return address)
3016
3017 // make room on stack for the return address
3018 // It will be patched later with the throwing pc. The correct value is not
3019 // available now because loading it from memory would destroy registers.
3020 __ push(0);
3021
3022 // Save everything in sight.
3023 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3024
3025 // Now it is safe to overwrite any register
3026
3027 // Deopt during an exception. Save exec mode for unpack_frames.
3028 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3029
3030 // load throwing pc from JavaThread and patch it as the return address
3031 // of the current frame. Then clear the field in JavaThread
3032
3033 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3034 __ movptr(Address(rbp, wordSize), rdx);
3035 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
3036
3037#ifdef ASSERT
3038 // verify that there is really an exception oop in JavaThread
3039 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3040 __ verify_oop(rax);
3041
3042 // verify that there is no pending exception
3043 Label no_pending_exception;
3044 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3045 __ testptr(rax, rax);
3046 __ jcc(Assembler::zero, no_pending_exception);
3047 __ stop("must not have pending exception here");
3048 __ bind(no_pending_exception);
3049#endif
3050
3051 __ bind(cont);
3052
3053 // Call C code. Need thread and this frame, but NOT official VM entry
3054 // crud. We cannot block on this call, no GC can happen.
3055 //
3056 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3057
3058 // fetch_unroll_info needs to call last_java_frame().
3059
3060 __ set_last_Java_frame(noreg, noreg, NULL);
3061#ifdef ASSERT
3062 { Label L;
3063 __ cmpptr(Address(r15_thread,
3064 JavaThread::last_Java_fp_offset()),
3065 (int32_t)0);
3066 __ jcc(Assembler::equal, L);
3067 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3068 __ bind(L);
3069 }
3070#endif // ASSERT
3071 __ mov(c_rarg0, r15_thread);
3072 __ movl(c_rarg1, r14); // exec_mode
3073 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3074
3075 // Need to have an oopmap that tells fetch_unroll_info where to
3076 // find any register it might need.
3077 oop_maps->add_gc_map(__ pc() - start, map);
3078
3079 __ reset_last_Java_frame(false);
3080
3081#if INCLUDE_JVMCI
3082 if (EnableJVMCI || UseAOT) {
3083 __ bind(after_fetch_unroll_info_call);
3084 }
3085#endif
3086
3087 // Load UnrollBlock* into rdi
3088 __ mov(rdi, rax);
3089
3090 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
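  // r14 now holds the unpack kind recorded in the UnrollBlock (one of
  // Deoptimization::Unpack_deopt/_exception/_uncommon_trap/_reexecute);
  // it is passed to unpack_frames as exec_mode further down.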
3091 Label noException;
3092 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
3093 __ jcc(Assembler::notEqual, noException);
3094 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3095 // QQQ this is useless; it was NULL above
3096 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3097 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
3098 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
3099
3100 __ verify_oop(rax);
3101
3102 // Overwrite the result registers with the exception results.
3103 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3104 // I think this is useless
3105 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3106
3107 __ bind(noException);
3108
3109 // Only register save data is on the stack.
3110 // Now restore the result registers. Everything else is either dead
3111 // or captured in the vframeArray.
3112 RegisterSaver::restore_result_registers(masm);
3113
3114 // All of the register save area has been popped off the stack. Only the
3115 // return address remains.
3116
3117 // Pop all the frames we must move/replace.
3118 //
3119 // Frame picture (youngest to oldest)
3120 // 1: self-frame (no frame link)
3121 // 2: deopting frame (no frame link)
3122 // 3: caller of deopting frame (could be compiled/interpreted).
3123 //
3124 // Note: by leaving the return address of the self-frame on the stack
3125 // and using the size of frame 2 to adjust the stack,
3126 // the return address to frame 3 will still be on the stack when we are done.
3127
3128 // Pop deoptimized frame
3129 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
3130 __ addptr(rsp, rcx);
3131
3132 // rsp should be pointing at the return address to the caller (3)
3133
3134 // Pick up the initial fp we should save
3135 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3136 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3137
3138#ifdef ASSERT
3139 // Compilers generate code that bangs the stack by as much as the
3140 // interpreter would need, so this stack banging should never
3141 // trigger a fault. Verify that it does not on non-product builds.
3142 if (UseStackBanging) {
3143 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3144 __ bang_stack_size(rbx, rcx);
3145 }
3146#endif
3147
3148 // Load address of array of frame pcs into rcx
3149 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3150
3151 // Trash the old pc
3152 __ addptr(rsp, wordSize);
3153
3154 // Load address of array of frame sizes into rsi
3155 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3156
3157 // Load counter into rdx
3158 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
3159
3160 // Now adjust the caller's stack to make up for the extra locals
3161 // but record the original sp so that we can save it in the skeletal interpreter
3162 // frame and the stack walking of interpreter_sender will get the unextended sp
3163 // value and not the "real" sp value.
3164
3165 const Register sender_sp = r8;
3166
3167 __ mov(sender_sp, rsp);
3168 __ movl(rbx, Address(rdi,
3169 Deoptimization::UnrollBlock::
3170 caller_adjustment_offset_in_bytes()));
3171 __ subptr(rsp, rbx);
3172
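  // Each iteration below pushes one skeletal interpreter frame: the saved
  // return pc, the saved rbp (via enter()), and the remaining frame_size - 2
  // words of the frame. The frames are only skeletons here; unpack_frames
  // and layout_activation_impl fill in the interpreter state later.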
3173 // Push interpreter frames in a loop
3174 Label loop;
3175 __ bind(loop);
3176 __ movptr(rbx, Address(rsi, 0)); // Load frame size
3177 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
3178 __ pushptr(Address(rcx, 0)); // Save return address
3179 __ enter(); // Save old & set new ebp
3180 __ subptr(rsp, rbx); // Prolog
3181 // This value is corrected by layout_activation_impl
3182 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3183 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3184 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
3185 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
3186 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
3187 __ decrementl(rdx); // Decrement counter
3188 __ jcc(Assembler::notZero, loop);
3189 __ pushptr(Address(rcx, 0)); // Save final return address
3190
3191 // Re-push self-frame
3192 __ enter(); // Save old & set new ebp
3193
3194 // Allocate a full sized register save area.
3195 // Return address and rbp are in place, so we allocate two less words.
3196 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3197
3198 // Restore frame locals after moving the frame
3199 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3200 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3201
3202 // Call C code. Need thread but NOT official VM entry
3203 // crud. We cannot block on this call, no GC can happen. Call should
3204 // restore return values to their stack-slots with the new SP.
3205 //
3206 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3207
3208 // Use rbp because the frames look interpreted now
3209 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3210 // Don't need the precise return PC here, just precise enough to point into this code blob.
3211 address the_pc = __ pc();
3212 __ set_last_Java_frame(noreg, rbp, the_pc);
3213
3214 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
3215 __ mov(c_rarg0, r15_thread);
3216 __ movl(c_rarg1, r14); // second arg: exec_mode
3217 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3218 // Revert SP alignment after call since we're going to do some SP relative addressing below
3219 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3220
3221 // Set an oopmap for the call site
3222 // Use the same PC we used for the last java frame
3223 oop_maps->add_gc_map(the_pc - start,
3224 new OopMap( frame_size_in_words, 0 ));
3225
3226 // Clear fp AND pc
3227 __ reset_last_Java_frame(true);
3228
3229 // Collect return values
3230 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3231 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3232 // I think this is useless (throwing pc?)
3233 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3234
3235 // Pop self-frame.
3236 __ leave(); // Epilog
3237
3238 // Jump to interpreter
3239 __ ret(0);
3240
3241 // Make sure all code is generated
3242 masm->flush();
3243
3244 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3245 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3246#if INCLUDE_JVMCI
3247 if (EnableJVMCI || UseAOT) {
3248 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3249 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3250 }
3251#endif
3252}
3253
3254#ifdef COMPILER2
3255//------------------------------generate_uncommon_trap_blob--------------------
3256void SharedRuntime::generate_uncommon_trap_blob() {
3257 // Allocate space for the code
3258 ResourceMark rm;
3259 // Setup code generation tools
3260 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3261 MacroAssembler* masm = new MacroAssembler(&buffer);
3262
3263 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
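  // framesize is counted in 32-bit slots; a multiple of 4 slots is a
  // multiple of 16 bytes, which is what the assert checks.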
3264
3265 address start = __ pc();
3266
3267 if (UseRTMLocking) {
3268 // Abort RTM transaction before possible nmethod deoptimization.
3269 __ xabort(0);
3270 }
3271
3272 // Push self-frame. We get here with a return address on the
3273 // stack, so rsp is 8-byte aligned until we allocate our frame.
3274 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3275
3276 // No callee saved registers. rbp is assumed implicitly saved
3277 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3278
3279 // The compiler left unloaded_class_index in j_rarg0; move it to where the
3280 // runtime expects it.
3281 __ movl(c_rarg1, j_rarg0);
3282
3283 __ set_last_Java_frame(noreg, noreg, NULL);
3284
3285 // Call C code. Need thread but NOT official VM entry
3286 // crud. We cannot block on this call, no GC can happen. Call should
3287 // capture callee-saved registers as well as return values.
3288 // Thread is in rdi already.
3289 //
3290 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3291
3292 __ mov(c_rarg0, r15_thread);
3293 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3294 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3295
3296 // Set an oopmap for the call site
3297 OopMapSet* oop_maps = new OopMapSet();
3298 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3299
3300 // location of rbp is known implicitly by the frame sender code
3301
3302 oop_maps->add_gc_map(__ pc() - start, map);
3303
3304 __ reset_last_Java_frame(false);
3305
3306 // Load UnrollBlock* into rdi
3307 __ mov(rdi, rax);
3308
3309#ifdef ASSERT
3310 { Label L;
3311 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3312 (int32_t)Deoptimization::Unpack_uncommon_trap);
3313 __ jcc(Assembler::equal, L);
3314 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
3315 __ bind(L);
3316 }
3317#endif
3318
3319 // Pop all the frames we must move/replace.
3320 //
3321 // Frame picture (youngest to oldest)
3322 // 1: self-frame (no frame link)
3323 // 2: deopting frame (no frame link)
3324 // 3: caller of deopting frame (could be compiled/interpreted).
3325
3326 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
3327 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3328
3329 // Pop deoptimized frame (int)
3330 __ movl(rcx, Address(rdi,
3331 Deoptimization::UnrollBlock::
3332 size_of_deoptimized_frame_offset_in_bytes()));
3333 __ addptr(rsp, rcx);
3334
3335 // rsp should be pointing at the return address to the caller (3)
3336
3337 // Pick up the initial fp we should save
3338 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3339 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3340
3341#ifdef ASSERT
3342 // Compilers generate code that bangs the stack by as much as the
3343 // interpreter would need, so this stack banging should never
3344 // trigger a fault. Verify that it does not on non-product builds.
3345 if (UseStackBanging) {
3346 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3347 __ bang_stack_size(rbx, rcx);
3348 }
3349#endif
3350
3351 // Load address of array of frame pcs into rcx (address*)
3352 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3353
3354 // Trash the return pc
3355 __ addptr(rsp, wordSize);
3356
3357 // Load address of array of frame sizes into rsi (intptr_t*)
3358 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3359
3360 // Counter
3361 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3362
3363 // Now adjust the caller's stack to make up for the extra locals but
3364 // record the original sp so that we can save it in the skeletal
3365 // interpreter frame and the stack walking of interpreter_sender
3366 // will get the unextended sp value and not the "real" sp value.
3367
3368 const Register sender_sp = r8;
3369
3370 __ mov(sender_sp, rsp);
3371 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3372 __ subptr(rsp, rbx);
3373
3374 // Push interpreter frames in a loop
3375 Label loop;
3376 __ bind(loop);
3377 __ movptr(rbx, Address(rsi, 0)); // Load frame size
3378 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
3379 __ pushptr(Address(rcx, 0)); // Save return address
3380 __ enter(); // Save old & set new rbp
3381 __ subptr(rsp, rbx); // Prolog
3382 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3383 sender_sp); // Make it walkable
3384 // This value is corrected by layout_activation_impl
3385 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3386 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
3387 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
3388 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
3389 __ decrementl(rdx); // Decrement counter
3390 __ jcc(Assembler::notZero, loop);
3391 __ pushptr(Address(rcx, 0)); // Save final return address
3392
3393 // Re-push self-frame
3394 __ enter(); // Save old & set new rbp
3395 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3396 // Prolog
3397
3398 // Use rbp because the frames look interpreted now
3399 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3400 // Don't need the precise return PC here, just precise enough to point into this code blob.
3401 address the_pc = __ pc();
3402 __ set_last_Java_frame(noreg, rbp, the_pc);
3403
3404 // Call C code. Need thread but NOT official VM entry
3405 // crud. We cannot block on this call, no GC can happen. Call should
3406 // restore return values to their stack-slots with the new SP.
3407 // Thread is in rdi already.
3408 //
3409 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3410
3411 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3412 __ mov(c_rarg0, r15_thread);
3413 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3414 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3415
3416 // Set an oopmap for the call site
3417 // Use the same PC we used for the last java frame
3418 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3419
3420 // Clear fp AND pc
3421 __ reset_last_Java_frame(true);
3422
3423 // Pop self-frame.
3424 __ leave(); // Epilog
3425
3426 // Jump to interpreter
3427 __ ret(0);
3428
3429 // Make sure all code is generated
3430 masm->flush();
3431
3432 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3433 SimpleRuntimeFrame::framesize >> 1);
3434}
3435#endif // COMPILER2
3436
3437
3438//------------------------------generate_handler_blob------
3439//
3440// Generate a special Compile2Runtime blob that saves all registers
3441// and sets up an oopmap.
3442//
3443SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3444 assert(StubRoutines::forward_exception_entry() != NULL,
3445 "must be generated before");
3446
3447 ResourceMark rm;
3448 OopMapSet *oop_maps = new OopMapSet();
3449 OopMap* map;
3450
3451 // Allocate space for the code. Setup code generation tools.
3452 CodeBuffer buffer("handler_blob", 2048, 1024);
3453 MacroAssembler* masm = new MacroAssembler(&buffer);
3454
3455 address start = __ pc();
3456 address call_pc = NULL;
3457 int frame_size_in_words;
3458 bool cause_return = (poll_type == POLL_AT_RETURN);
3459 bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
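  // cause_return: the poll was at a return instruction, so a valid return
  // address is already on the stack. save_vectors: the poll was inside a
  // vectorized loop, so the full vector registers must be saved and
  // restored around the call into the runtime.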
3460
3461 if (UseRTMLocking) {
3462 // Abort RTM transaction before calling runtime
3463 // because critical section will be large and will be
3464 // aborted anyway. Also nmethod could be deoptimized.
3465 __ xabort(0);
3466 }
3467
3468 // Make room for return address (or push it again)
3469 if (!cause_return) {
3470 __ push(rbx);
3471 }
3472
3473 // Save registers, fpu state, and flags
3474 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3475
3476 // The following is basically a call_VM. However, we need the precise
3477 // address of the call in order to generate an oopmap. Hence, we do all the
3478 // work ourselves.
3479
3480 __ set_last_Java_frame(noreg, noreg, NULL);
3481
3482 // The return address must always be correct so that the frame constructor
3483 // never sees an invalid pc.
3484
3485 if (!cause_return) {
3486 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3487 // Additionally, rbx is a callee saved register and we can look at it later to determine
3488 // if someone changed the return address for us!
3489 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3490 __ movptr(Address(rbp, wordSize), rbx);
3491 }
3492
3493 // Do the call
3494 __ mov(c_rarg0, r15_thread);
3495 __ call(RuntimeAddress(call_ptr));
3496
3497 // Set an oopmap for the call site. This oopmap will map all
3498 // oop-registers and debug-info registers as callee-saved. This
3499 // will allow deoptimization at this safepoint to find all possible
3500 // debug-info recordings, as well as let GC find all oops.
3501
3502 oop_maps->add_gc_map( __ pc() - start, map);
3503
3504 Label noException;
3505
3506 __ reset_last_Java_frame(false);
3507
3508 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3509 __ jcc(Assembler::equal, noException);
3510
3511 // Exception pending
3512
3513 RegisterSaver::restore_live_registers(masm, save_vectors);
3514
3515 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3516
3517 // No exception case
3518 __ bind(noException);
3519
3520 Label no_adjust;
3521#ifdef ASSERT
3522 Label bail;
3523#endif
3524 if (SafepointMechanism::uses_thread_local_poll() && !cause_return) {
3525 Label no_prefix, not_special;
3526
3527 // If our stashed return pc was modified by the runtime we avoid touching it
3528 __ cmpptr(rbx, Address(rbp, wordSize));
3529 __ jccb(Assembler::notEqual, no_adjust);
3530
3531 // Skip over the poll instruction.
3532 // See NativeInstruction::is_safepoint_poll()
3533 // Possible encodings:
3534 // 85 00 test %eax,(%rax)
3535 // 85 01 test %eax,(%rcx)
3536 // 85 02 test %eax,(%rdx)
3537 // 85 03 test %eax,(%rbx)
3538 // 85 06 test %eax,(%rsi)
3539 // 85 07 test %eax,(%rdi)
3540 //
3541 // 41 85 00 test %eax,(%r8)
3542 // 41 85 01 test %eax,(%r9)
3543 // 41 85 02 test %eax,(%r10)
3544 // 41 85 03 test %eax,(%r11)
3545 // 41 85 06 test %eax,(%r14)
3546 // 41 85 07 test %eax,(%r15)
3547 //
3548 // 85 04 24 test %eax,(%rsp)
3549 // 41 85 04 24 test %eax,(%r12)
3550 // 85 45 00 test %eax,0x0(%rbp)
3551 // 41 85 45 00 test %eax,0x0(%r13)
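    //
    // Worked example: for "41 85 45 00" (test %eax,0x0(%r13)) rbx is
    // advanced by 1 for the REX prefix (0x41), by 1 more for the disp8
    // byte that an rbp/r13 base requires, and by 2 at the end for the
    // opcode and modrm bytes: 4 bytes in total, the full poll length.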
3552
3553 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3554 __ jcc(Assembler::notEqual, no_prefix);
3555 __ addptr(rbx, 1);
3556 __ bind(no_prefix);
3557#ifdef ASSERT
3558 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3559#endif
3560 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3561 // r12/rsp 0x04
3562 // r13/rbp 0x05
3563 __ movzbq(rcx, Address(rbx, 1));
3564 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3565 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3566 __ cmpptr(rcx, 1);
3567 __ jcc(Assembler::above, not_special);
3568 __ addptr(rbx, 1);
3569 __ bind(not_special);
3570#ifdef ASSERT
3571 // Verify the correct encoding of the poll we're about to skip.
3572 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3573 __ jcc(Assembler::notEqual, bail);
3574 // Mask out the modrm bits
3575 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3576 // rax encodes to 0, so if the bits are nonzero it's incorrect
3577 __ jcc(Assembler::notZero, bail);
3578#endif
3579 // Adjust return pc forward to step over the safepoint poll instruction
3580 __ addptr(rbx, 2);
3581 __ movptr(Address(rbp, wordSize), rbx);
3582 }
3583
3584 __ bind(no_adjust);
3585 // Normal exit, restore registers and exit.
3586 RegisterSaver::restore_live_registers(masm, save_vectors);
3587 __ ret(0);
3588
3589#ifdef ASSERT
3590 __ bind(bail);
3591 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3592#endif
3593
3594 // Make sure all code is generated
3595 masm->flush();
3596
3597 // Fill-out other meta info
3598 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3599}
3600
3601//
3602// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3603//
3604// Generate a stub that calls into vm to find out the proper destination
3605// of a java call. All the argument registers are live at this point
3606// but since this is generic code we don't know what they are and the caller
3607// must do any gc of the args.
3608//
3609RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3610 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3611
3612 // allocate space for the code
3613 ResourceMark rm;
3614
3615 CodeBuffer buffer(name, 1000, 512);
3616 MacroAssembler* masm = new MacroAssembler(&buffer);
3617
3618 int frame_size_in_words;
3619
3620 OopMapSet *oop_maps = new OopMapSet();
3621 OopMap* map = NULL;
3622
3623 int start = __ offset();
3624
3625 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
3626
3627 int frame_complete = __ offset();
3628
3629 __ set_last_Java_frame(noreg, noreg, NULL);
3630
3631 __ mov(c_rarg0, r15_thread);
3632
3633 __ call(RuntimeAddress(destination));
3634
3635
3636 // Set an oopmap for the call site.
3637 // We need this not only for callee-saved registers, but also for volatile
3638 // registers that the compiler might be keeping live across a safepoint.
3639
3640 oop_maps->add_gc_map( __ offset() - start, map);
3641
3642 // rax contains the address we are going to jump to assuming no exception got installed
3643
3644 // clear last_Java_sp
3645 __ reset_last_Java_frame(false);
3646 // check for pending exceptions
3647 Label pending;
3648 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3649 __ jcc(Assembler::notEqual, pending);
3650
3651 // get the returned Method*
3652 __ get_vm_result_2(rbx, r15_thread);
3653 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3654
3655 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3656
3657 RegisterSaver::restore_live_registers(masm);
3658
3659 // We are back to the original state on entry and ready to go.
3660
3661 __ jmp(rax);
3662
3663 // Pending exception after the safepoint
3664
3665 __ bind(pending);
3666
3667 RegisterSaver::restore_live_registers(masm);
3668
3669 // exception pending => remove activation and forward to exception handler
3670
3671 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3672
3673 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3674 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3675
3676 // -------------
3677 // make sure all code is generated
3678 masm->flush();
3679
3680 // return the blob
3681 // frame_size_words or bytes??
3682 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3683}
3684
3685
3686//------------------------------Montgomery multiplication------------------------
3687//
3688
3689#ifndef _WINDOWS
3690
3691#define ASM_SUBTRACT
3692
3693#ifdef ASM_SUBTRACT
3694// Subtract 0:b from carry:a. Return carry.
3695static unsigned long
3696sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3697 long i = 0, cnt = len;
3698 unsigned long tmp;
3699 asm volatile("clc; "
3700 "0: ; "
3701 "mov (%[b], %[i], 8), %[tmp]; "
3702 "sbb %[tmp], (%[a], %[i], 8); "
3703 "inc %[i]; dec %[cnt]; "
3704 "jne 0b; "
3705 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3706 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3707 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3708 : "memory");
3709 return tmp;
3710}
3711#else // ASM_SUBTRACT
3712typedef int __attribute__((mode(TI))) int128;
3713
3714// Subtract 0:b from carry:a. Return carry.
3715static unsigned long
3716sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3717 int128 tmp = 0;
3718 int i;
3719 for (i = 0; i < len; i++) {
3720 tmp += a[i];
3721 tmp -= b[i];
3722 a[i] = tmp;
3723 tmp >>= 64;
3724 assert(-1 <= tmp && tmp <= 0, "invariant");
3725 }
3726 return tmp + carry;
3727}
3728#endif // ! ASM_SUBTRACT
3729
3730// Multiply (unsigned) Long A by Long B, accumulating the double-
3731// length result into the accumulator formed of T0, T1, and T2.
3732#define MACC(A, B, T0, T1, T2) \
3733do { \
3734 unsigned long hi, lo; \
3735 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3736 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3737 : "r"(A), "a"(B) : "cc"); \
3738 } while(0)
3739
3740// As above, but add twice the double-length result into the
3741// accumulator.
3742#define MACC2(A, B, T0, T1, T2) \
3743do { \
3744 unsigned long hi, lo; \
3745 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3746 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3747 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3748 : "r"(A), "a"(B) : "cc"); \
3749 } while(0)
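
// In C terms (a sketch, not used by the code), MACC accumulates the 128-bit
// product A*B into the 192-bit accumulator T2:T1:T0, roughly:
//   unsigned __int128 p = (unsigned __int128)A * B;
//   add (unsigned long)p to T0 and (unsigned long)(p >> 64) to T1,
//   propagating carries into T2.
// MACC2 does the same but adds the product twice; it is used for the
// doubled cross terms in montgomery_square below.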
3750
3751// Fast Montgomery multiplication. The derivation of the algorithm is
3752// in A Cryptographic Library for the Motorola DSP56000,
3753// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3754
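// montgomery_multiply computes m == a * b * 2^(-64*len) (mod n), operating
// on little-endian arrays of len 64-bit words. inv must be -n^-1 mod 2^64,
// which the assert below checks against n[0].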
3755static void __attribute__((noinline))
3756montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3757 unsigned long m[], unsigned long inv, int len) {
3758 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3759 int i;
3760
3761 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3762
3763 for (i = 0; i < len; i++) {
3764 int j;
3765 for (j = 0; j < i; j++) {
3766 MACC(a[j], b[i-j], t0, t1, t2);
3767 MACC(m[j], n[i-j], t0, t1, t2);
3768 }
3769 MACC(a[i], b[0], t0, t1, t2);
3770 m[i] = t0 * inv;
3771 MACC(m[i], n[0], t0, t1, t2);
3772
3773 assert(t0 == 0, "broken Montgomery multiply");
3774
3775 t0 = t1; t1 = t2; t2 = 0;
3776 }
3777
3778 for (i = len; i < 2*len; i++) {
3779 int j;
3780 for (j = i-len+1; j < len; j++) {
3781 MACC(a[j], b[i-j], t0, t1, t2);
3782 MACC(m[j], n[i-j], t0, t1, t2);
3783 }
3784 m[i-len] = t0;
3785 t0 = t1; t1 = t2; t2 = 0;
3786 }
3787
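  // The reduced result may still carry out of the top word; subtract n
  // until the carry word t0 is clear.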
3788 while (t0)
3789 t0 = sub(m, n, t0, len);
3790}
3791
3792// Fast Montgomery squaring. This uses asymptotically 25% fewer
3793// multiplies so it should be up to 25% faster than Montgomery
3794// multiplication. However, its loop control is more complex and it
3795// may actually run slower on some machines.
3796
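// montgomery_square computes m == a^2 * 2^(-64*len) (mod n). The cross
// products a[j]*a[i-j] occur twice in the square, so they are accumulated
// with MACC2; the diagonal term a[j]^2 (present only when i is even) is
// added once with MACC.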
3797static void __attribute__((noinline))
3798montgomery_square(unsigned long a[], unsigned long n[],
3799 unsigned long m[], unsigned long inv, int len) {
3800 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3801 int i;
3802
3803 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3804
3805 for (i = 0; i < len; i++) {
3806 int j;
3807 int end = (i+1)/2;
3808 for (j = 0; j < end; j++) {
3809 MACC2(a[j], a[i-j], t0, t1, t2);
3810 MACC(m[j], n[i-j], t0, t1, t2);
3811 }
3812 if ((i & 1) == 0) {
3813 MACC(a[j], a[j], t0, t1, t2);
3814 }
3815 for (; j < i; j++) {
3816 MACC(m[j], n[i-j], t0, t1, t2);
3817 }
3818 m[i] = t0 * inv;
3819 MACC(m[i], n[0], t0, t1, t2);
3820
3821 assert(t0 == 0, "broken Montgomery square");
3822
3823 t0 = t1; t1 = t2; t2 = 0;
3824 }
3825
3826 for (i = len; i < 2*len; i++) {
3827 int start = i-len+1;
3828 int end = start + (len - start)/2;
3829 int j;
3830 for (j = start; j < end; j++) {
3831 MACC2(a[j], a[i-j], t0, t1, t2);
3832 MACC(m[j], n[i-j], t0, t1, t2);
3833 }
3834 if ((i & 1) == 0) {
3835 MACC(a[j], a[j], t0, t1, t2);
3836 }
3837 for (; j < len; j++) {
3838 MACC(m[j], n[i-j], t0, t1, t2);
3839 }
3840 m[i-len] = t0;
3841 t0 = t1; t1 = t2; t2 = 0;
3842 }
3843
3844 while (t0)
3845 t0 = sub(m, n, t0, len);
3846}
3847
3848// Swap words in a longword.
3849static unsigned long swap(unsigned long x) {
3850 return (x << 32) | (x >> 32);
3851}
3852
3853// Copy len longwords from s to d, word-swapping as we go. The
3854// destination array is reversed.
3855static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3856 d += len;
3857 while(len-- > 0) {
3858 d--;
3859 *d = swap(*s);
3860 s++;
3861 }
3862}
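
// Together, swap() and reverse_words() convert between the jint arrays
// handed in by the Java side (ordered most-significant int first) and the
// little-endian arrays of 64-bit words that the Montgomery routines above
// operate on, and back again for the result.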
3863
3864// The threshold at which squaring is advantageous was determined
3865// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3866#define MONTGOMERY_SQUARING_THRESHOLD 64
3867
3868void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3869 jint len, jlong inv,
3870 jint *m_ints) {
3871 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3872 int longwords = len/2;
3873
3874 // Make very sure we don't use so much space that the stack might
3875 // overflow. 512 jints corresponds to a 16384-bit integer and
3876 // will use a total of 8k bytes of stack space here.
3877 int total_allocation = longwords * sizeof (unsigned long) * 4;
3878 guarantee(total_allocation <= 8192, "must be");
3879 unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3880
3881 // Local scratch arrays
3882 unsigned long
3883 *a = scratch + 0 * longwords,
3884 *b = scratch + 1 * longwords,
3885 *n = scratch + 2 * longwords,
3886 *m = scratch + 3 * longwords;
3887
3888 reverse_words((unsigned long *)a_ints, a, longwords);
3889 reverse_words((unsigned long *)b_ints, b, longwords);
3890 reverse_words((unsigned long *)n_ints, n, longwords);
3891
3892 ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3893
3894 reverse_words(m, (unsigned long *)m_ints, longwords);
3895}
3896
3897void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3898 jint len, jlong inv,
3899 jint *m_ints) {
3900 assert(len % 2 == 0, "array length in montgomery_square must be even");
3901 int longwords = len/2;
3902
3903 // Make very sure we don't use so much space that the stack might
3904 // overflow. 512 jints corresponds to a 16384-bit integer and
3905 // will use a total of 6k bytes of stack space here.
3906 int total_allocation = longwords * sizeof (unsigned long) * 3;
3907 guarantee(total_allocation <= 8192, "must be");
3908 unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3909
3910 // Local scratch arrays
3911 unsigned long
3912 *a = scratch + 0 * longwords,
3913 *n = scratch + 1 * longwords,
3914 *m = scratch + 2 * longwords;
3915
3916 reverse_words((unsigned long *)a_ints, a, longwords);
3917 reverse_words((unsigned long *)n_ints, n, longwords);
3918
3919 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3920 ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3921 } else {
3922 ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3923 }
3924
3925 reverse_words(m, (unsigned long *)m_ints, longwords);
3926}
3927
3928#endif // !_WINDOWS
3929
3930#ifdef COMPILER2
3931// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3932//
3933//------------------------------generate_exception_blob---------------------------
3934// creates exception blob at the end
3935// Using exception blob, this code is jumped from a compiled method.
3936// (see emit_exception_handler in x86_64.ad file)
3937//
3938// Given an exception pc at a call we call into the runtime for the
3939// handler in this method. This handler might merely restore state
3940// (i.e. callee save registers) unwind the frame and jump to the
3941// exception handler for the nmethod if there is no Java level handler
3942// for the nmethod.
3943//
3944// This code is entered with a jmp.
3945//
3946// Arguments:
3947// rax: exception oop
3948// rdx: exception pc
3949//
3950// Results:
3951// rax: exception oop
3952// rdx: exception pc in caller or ???
3953// destination: exception handler of caller
3954//
3955// Note: the exception pc MUST be at a call (precise debug information)
3956// Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3957//
3958
3959void OptoRuntime::generate_exception_blob() {
3960 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3961 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3962 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3963
3964 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3965
3966 // Allocate space for the code
3967 ResourceMark rm;
3968 // Setup code generation tools
3969 CodeBuffer buffer("exception_blob", 2048, 1024);
3970 MacroAssembler* masm = new MacroAssembler(&buffer);
3971
3972
3973 address start = __ pc();
3974
3975 // Exception pc is 'return address' for stack walker
3976 __ push(rdx);
3977 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3978
3979 // Save callee-saved registers. See x86_64.ad.
3980
3981 // rbp is an implicitly saved callee saved register (i.e., the calling
3982 // convention will save/restore it in the prolog/epilog). Other than that
3983 // there are no callee save registers now that adapter frames are gone.
3984
3985 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3986
3987 // Store exception in Thread object. We cannot pass any arguments to the
3988 // handle_exception call, since we do not want to make any assumption
3989 // about the size of the frame where the exception happened in.
3990 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3991 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3992 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3993
3994 // This call does all the hard work. It checks if an exception handler
3995 // exists in the method.
3996 // If so, it returns the handler address.
3997 // If not, it prepares for stack-unwinding, restoring the callee-save
3998 // registers of the frame being removed.
3999 //
4000 // address OptoRuntime::handle_exception_C(JavaThread* thread)
4001
4002 // At a method handle call, the stack may not be properly aligned
4003 // when returning with an exception.
4004 address the_pc = __ pc();
4005 __ set_last_Java_frame(noreg, noreg, the_pc);
4006 __ mov(c_rarg0, r15_thread);
4007 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
4008 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
4009
4010 // Set an oopmap for the call site. This oopmap will only be used if we
4011 // are unwinding the stack. Hence, all locations will be dead.
4012 // Callee-saved registers will be the same as the frame above (i.e.,
4013 // handle_exception_stub), since they were restored when we got the
4014 // exception.
4015
4016 OopMapSet* oop_maps = new OopMapSet();
4017
4018 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
4019
4020 __ reset_last_Java_frame(false);
4021
4022 // Restore callee-saved registers
4023
4024 // rbp is an implicitly saved callee-saved register (i.e., the calling
4025 // convention will save/restore it in the prolog/epilog). Other than that
4026 // there are no callee-save registers now that adapter frames are gone.
4027
4028 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
4029
4030 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
4031 __ pop(rdx); // No need for exception pc anymore
4032
4033 // rax: exception handler
4034
4035 // We have a handler in rax (could be deopt blob).
4036 __ mov(r8, rax);
4037
4038 // Get the exception oop
4039 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
4040 // Get the exception pc in case we are deoptimized
4041 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
4042#ifdef ASSERT
4043 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
4044 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
4045#endif
4046 // Clear the exception oop so GC no longer processes it as a root.
4047 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
4048
4049 // rax: exception oop
4050 // r8: exception handler
4051 // rdx: exception pc
4052 // Jump to handler
4053
4054 __ jmp(r8);
4055
4056 // Make sure all code is generated
4057 masm->flush();
4058
4059 // Set exception blob
4060 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
4061}
4062#endif // COMPILER2
4063