/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};
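
// Illustrative picture of the SimpleRuntimeFrame layout above (slots are jints,
// offsets grow toward higher addresses):
//
//   [arg register save area]   <- rsp, frame::arg_reg_save_area_bytes
//   [saved rbp             ]   <- rbp_off / rbp_off2
//   [return address        ]   <- return_off / return_off2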

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off    = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off    = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
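
// Illustrative use of the RegisterSaver API (a sketch of the pattern used by
// runtime stubs, not a definitive recipe):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0 /* additional_frame_words */,
//                                                    &frame_size_in_words, false /* save_wide_vectors */);
//   // ... emit the VM call and record `map` in the stub's OopMapSet ...
//   RegisterSaver::restore_live_registers(masm);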

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled by pop_CPU_state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}
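
// For example, a 32-byte YMM or 64-byte ZMM vector counts as "wide" and needs the
// extra save/restore paths above, while a 16-byte XMM vector is already covered by
// the default fxsave/fxrstor area.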

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher. Register
// values (0 up to RegisterImpl::number_of_registers) are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}
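
// Illustrative example: for the Java signature (int, long, Object, float, double)
// the loop above assigns
//   int    -> j_rarg0     long   -> j_rarg1 (its T_VOID half is set_bad)
//   Object -> j_rarg2     float  -> j_farg0
//   double -> j_farg1 (its T_VOID half is set_bad)
// and only falls back to stack slots once the register pools are exhausted.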

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus one word because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling.

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32   T_LONG
    // 1   24   T_VOID
    // 2   16   T_OBJECT
    // 3    8   T_BOOL
    // -    0   return address
    //
    // However, to make things extra confusing: because we can fit a long/double
    // in a single slot on a 64-bit VM and it would be silly to break them up,
    // the interpreter leaves one slot empty and only stores to a single slot.
    // In this case the slot that is occupied is the T_VOID slot. (See, I said
    // it was confusing.)

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}
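
// Note: range_check jumps to L_ok only when pc_reg lies strictly inside
// (code_start, code_end); for any other value it falls through at L_fail so the
// caller can emit its own failure handling (see its use in gen_i2c_adapter below).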

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args. Since up to 6 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
    // registers are below. By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address; this misaligns the stack so that the youngest
  // frame sees it exactly as it would just after the placement of a call
  // instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss, a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed with negative offsets so the LSW is at the LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it should we end up there;
  // only needed because the c2 resolve stubs return the Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}
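
// The AdapterHandlerEntry returned above bundles the code addresses generated
// here: i2c_entry (interpreted caller -> compiled callee), c2i_entry (compiled
// caller -> interpreted callee), c2i_unverified_entry (same, but with the
// inline-cache klass check first) and, when fast class-init checks are
// supported, c2i_no_clinit_check_entry, which bypasses the static-initializer
// barrier.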

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for four 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
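
// Illustrative example: for the C signature (int, double, jlong) the loop above
// assigns
//   SysV (Linux/BSD/macOS):  int -> c_rarg0, double -> c_farg0, jlong -> c_rarg1
//   Win64:                   int -> c_rarg0, double -> c_farg1, jlong -> c_rarg2
// because on Win64 integer and floating-point arguments share one positional
// sequence, which is why the int/fp counters bump each other above.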

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}
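
// Illustrative example: with num_bits == 256, argument i is passed in the full
// 256-bit register i, expressed as the VMReg pair (xmm_i, xmm_i->next(7)); no
// stack slots are ever used, hence the returned stk_args of 0.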

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
  }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
  }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}
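
// For example, for a non-null jint[] argument, body_arg receives the address of
// the first element (base_offset_in_bytes(T_INT) past the oop) and length_arg
// receives the jint length; for a null array both are passed as zero.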


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments. There's no simple way to
// order them safely. Compute a safe order for issuing stores and
// break any cycles in those stores. This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair _src;
    VMRegPair _dst;
    int _src_index;
    int _dst_index;
    bool _processed;
    MoveOperation* _next;
    MoveOperation* _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves. The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list. Processing it backwards finds
    // the beginning of the chain, forwards finds the end. If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};
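
// Illustrative example of the cycle breaking above: if the shuffle needs the two
// moves rdi -> rsi and rsi -> rdi, they form a cycle, so one of them is rewritten
// to store into tmp_vmreg first and a new trailing move from tmp_vmreg to the
// original destination is appended (see MoveOperation::break_cycle()).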
1447
1448 static void verify_oop_args(MacroAssembler* masm,
1449 const methodHandle& method,
1450 const BasicType* sig_bt,
1451 const VMRegPair* regs) {
1452 Register temp_reg = rbx; // not part of any compiled calling seq
1453 if (VerifyOops) {
1454 for (int i = 0; i < method->size_of_parameters(); i++) {
1455 if (is_reference_type(sig_bt[i])) {
1456 VMReg r = regs[i].first();
1457 assert(r->is_valid(), "bad oop arg");
1458 if (r->is_stack()) {
1459 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1460 __ verify_oop(temp_reg);
1461 } else {
1462 __ verify_oop(r->as_Register());
1463 }
1464 }
1465 }
1466 }
1467 }
1468
1469 static void gen_special_dispatch(MacroAssembler* masm,
1470 const methodHandle& method,
1471 const BasicType* sig_bt,
1472 const VMRegPair* regs) {
1473 verify_oop_args(masm, method, sig_bt, regs);
1474 vmIntrinsics::ID iid = method->intrinsic_id();
1475
1476 // Now write the args into the outgoing interpreter space
1477 bool has_receiver = false;
1478 Register receiver_reg = noreg;
1479 int member_arg_pos = -1;
1480 Register member_reg = noreg;
1481 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1482 if (ref_kind != 0) {
1483 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1484 member_reg = rbx; // known to be free at this point
1485 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1486 } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1487 has_receiver = true;
1488 } else {
1489 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1490 }
1491
1492 if (member_reg != noreg) {
1493 // Load the member_arg into register, if necessary.
1494 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1495 VMReg r = regs[member_arg_pos].first();
1496 if (r->is_stack()) {
1497 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1498 } else {
1499 // no data motion is needed
1500 member_reg = r->as_Register();
1501 }
1502 }
1503
1504 if (has_receiver) {
1505 // Make sure the receiver is loaded into a register.
1506 assert(method->size_of_parameters() > 0, "oob");
1507 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1508 VMReg r = regs[0].first();
1509 assert(r->is_valid(), "bad receiver arg");
1510 if (r->is_stack()) {
1511 // Porting note: This assumes that compiled calling conventions always
1512 // pass the receiver oop in a register. If this is not true on some
1513 // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, since it is impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them. To figure out where they go
1587 // we convert the java signature to a C signature by inserting
1588 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1589
1590 const int total_in_args = method->size_of_parameters();
1591 int total_c_args = total_in_args;
1592 if (!is_critical_native) {
1593 total_c_args += 1;
1594 if (method->is_static()) {
1595 total_c_args++;
1596 }
1597 } else {
1598 for (int i = 0; i < total_in_args; i++) {
1599 if (in_sig_bt[i] == T_ARRAY) {
1600 total_c_args++;
1601 }
1602 }
1603 }
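// Illustrative counts: a static native method taking a single int has
// total_in_args == 1 and total_c_args == 3 (JNIEnv*, class mirror, int),
// while for a critical native each T_ARRAY parameter adds one extra C
// argument because it is expanded into a (length, address) pair below.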
1604
1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1607 BasicType* in_elem_bt = NULL;
1608
1609 int argc = 0;
1610 if (!is_critical_native) {
1611 out_sig_bt[argc++] = T_ADDRESS;
1612 if (method->is_static()) {
1613 out_sig_bt[argc++] = T_OBJECT;
1614 }
1615
1616 for (int i = 0; i < total_in_args ; i++ ) {
1617 out_sig_bt[argc++] = in_sig_bt[i];
1618 }
1619 } else {
1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1621 SignatureStream ss(method->signature());
1622 for (int i = 0; i < total_in_args ; i++ ) {
1623 if (in_sig_bt[i] == T_ARRAY) {
1624 // Arrays are passed as int, elem* pair
1625 out_sig_bt[argc++] = T_INT;
1626 out_sig_bt[argc++] = T_ADDRESS;
1627 ss.skip_array_prefix(1); // skip one '['
1628 assert(ss.is_primitive(), "primitive type expected");
1629 in_elem_bt[i] = ss.type();
1630 } else {
1631 out_sig_bt[argc++] = in_sig_bt[i];
1632 in_elem_bt[i] = T_VOID;
1633 }
1634 if (in_sig_bt[i] != T_VOID) {
1635 assert(in_sig_bt[i] == ss.type() ||
1636 in_sig_bt[i] == T_ARRAY, "must match");
1637 ss.next();
1638 }
1639 }
1640 }
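// For example (illustrative), a critical-native byte[] parameter appears in
// out_sig_bt as the pair (T_INT length, T_ADDRESS body), and in_elem_bt
// records T_BYTE so the array can be unpacked with the right element type.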
1641
1642 // Now figure out where the args must be stored and how much stack space
1643 // they require.
1644 int out_arg_slots;
1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1646
1647 // Compute framesize for the wrapper. We need to handlize all oops in
1648 // incoming registers
1649
1650 // Calculate the total number of stack slots we will need.
1651
1652 // First count the abi requirement plus all of the outgoing args
1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1654
1655 // Now the space for the inbound oop handle area
1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1657 if (is_critical_native) {
1658 // Critical natives may have to call out so they need a save area
1659 // for register arguments.
1660 int double_slots = 0;
1661 int single_slots = 0;
1662 for ( int i = 0; i < total_in_args; i++) {
1663 if (in_regs[i].first()->is_Register()) {
1664 const Register reg = in_regs[i].first()->as_Register();
1665 switch (in_sig_bt[i]) {
1666 case T_BOOLEAN:
1667 case T_BYTE:
1668 case T_SHORT:
1669 case T_CHAR:
1670 case T_INT: single_slots++; break;
1671 case T_ARRAY: // specific to LP64 (7145024)
1672 case T_LONG: double_slots++; break;
1673 default: ShouldNotReachHere();
1674 }
1675 } else if (in_regs[i].first()->is_XMMRegister()) {
1676 switch (in_sig_bt[i]) {
1677 case T_FLOAT: single_slots++; break;
1678 case T_DOUBLE: double_slots++; break;
1679 default: ShouldNotReachHere();
1680 }
1681 } else if (in_regs[i].first()->is_FloatRegister()) {
1682 ShouldNotReachHere();
1683 }
1684 }
1685 total_save_slots = double_slots * 2 + single_slots;
1686 // align the save area
1687 if (double_slots != 0) {
1688 stack_slots = align_up(stack_slots, 2);
1689 }
1690 }
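// Illustrative sizing: a critical native receiving two ints and one double in
// registers gets single_slots == 2 and double_slots == 1, so
// total_save_slots == 4 and stack_slots is rounded up to an even slot count.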
1691
1692 int oop_handle_offset = stack_slots;
1693 stack_slots += total_save_slots;
1694
1695 // Now any space we need for handlizing a klass if static method
1696
1697 int klass_slot_offset = 0;
1698 int klass_offset = -1;
1699 int lock_slot_offset = 0;
1700 bool is_static = false;
1701
1702 if (method->is_static()) {
1703 klass_slot_offset = stack_slots;
1704 stack_slots += VMRegImpl::slots_per_word;
1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1706 is_static = true;
1707 }
1708
1709 // Plus a lock if needed
1710
1711 if (method->is_synchronized()) {
1712 lock_slot_offset = stack_slots;
1713 stack_slots += VMRegImpl::slots_per_word;
1714 }
1715
1716 // Now a place (+2) to save return values or temp during shuffling
1717 // + 4 for return address (which we own) and saved rbp
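// (That is 2 slots of spill area plus 2 slots for the return address and 2 for
// the saved rbp: 24 bytes in all with 4-byte stack slots.)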
1718 stack_slots += 6;
1719
1720 // Ok The space we have allocated will look like:
1721 //
1722 //
1723 // FP-> | |
1724 // |---------------------|
1725 // | 2 slots for moves |
1726 // |---------------------|
1727 // | lock box (if sync) |
1728 // |---------------------| <- lock_slot_offset
1729 // | klass (if static) |
1730 // |---------------------| <- klass_slot_offset
1731 // | oopHandle area |
1732 // |---------------------| <- oop_handle_offset (6 java arg registers)
1733 // | outbound memory |
1734 // | based arguments |
1735 // | |
1736 // |---------------------|
1737 // | |
1738 // SP-> | out_preserved_slots |
1739 //
1740 //
1741
1742
1743 // Now compute the actual number of stack words we need, rounding to keep the
1744 // stack properly aligned.
1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1746
1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
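// With 4-byte VMReg stack slots and the 16-byte stack alignment required by
// the amd64 ABI, the round-up above makes stack_size a multiple of 16 bytes.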
1748
1749 // First thing make an ic check to see if we should even be here
1750
1751 // We are free to use all registers as temps without saving them and
1752 // restoring them except rbp. rbp is the only callee save register
1753 // as far as the interpreter and the compiler(s) are concerned.
1754
1755
1756 const Register ic_reg = rax;
1757 const Register receiver = j_rarg0;
1758
1759 Label hit;
1760 Label exception_pending;
1761
1762 assert_different_registers(ic_reg, receiver, rscratch1);
1763 __ verify_oop(receiver);
1764 __ load_klass(rscratch1, receiver, rscratch2);
1765 __ cmpq(ic_reg, rscratch1);
1766 __ jcc(Assembler::equal, hit);
1767
1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1769
1770 // Verified entry point must be aligned
1771 __ align(8);
1772
1773 __ bind(hit);
1774
1775 int vep_offset = ((intptr_t)__ pc()) - start;
1776
1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1778 Label L_skip_barrier;
1779 Register klass = r10;
1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1782
1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1784
1785 __ bind(L_skip_barrier);
1786 }
1787
1788 #ifdef COMPILER1
1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1792 }
1793 #endif // COMPILER1
1794
1795 // The instruction at the verified entry point must be 5 bytes or longer
1796 // because it can be patched on the fly by make_not_entrant. The stack bang
1797 // instruction fits that requirement.
1798
1799 // Generate stack overflow check
1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1801
1802 // Generate a new frame for the wrapper.
1803 __ enter();
1804 // -2 because return address is already present and so is saved rbp
1805 __ subptr(rsp, stack_size - 2*wordSize);
1806
1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1808 bs->nmethod_entry_barrier(masm);
1809
1810 // Frame is now completed as far as size and linkage.
1811 int frame_complete = ((intptr_t)__ pc()) - start;
1812
1813 if (UseRTMLocking) {
1814 // Abort RTM transaction before calling JNI
1815 // because critical section will be large and will be
1816 // aborted anyway. Also nmethod could be deoptimized.
1817 __ xabort(0);
1818 }
1819
1820 #ifdef ASSERT
1821 {
1822 Label L;
1823 __ mov(rax, rsp);
1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1825 __ cmpptr(rax, rsp);
1826 __ jcc(Assembler::equal, L);
1827 __ stop("improperly aligned stack");
1828 __ bind(L);
1829 }
1830 #endif /* ASSERT */
1831
1832
1833 // We use r14 as the oop handle for the receiver/klass
1834 // It is callee save so it survives the call to native
1835
1836 const Register oop_handle_reg = r14;
1837
1838 //
1839 // We immediately shuffle the arguments so that for any vm call we have to
1840 // make from here on out (sync slow path, jvmti, etc.) we will have
1841 // captured the oops from our caller and have a valid oopMap for
1842 // them.
1843
1844 // -----------------
1845 // The Grand Shuffle
1846
1847 // The Java calling convention is either equal (linux) or denser (win64) than the
1848 // c calling convention. However, because of the jni_env argument, the c calling
1849 // convention always has at least one more argument (and two more for static methods) than Java.
1850 // Therefore if we move the args from java -> c backwards then we will never have
1851 // a register->register conflict and we don't have to build a dependency graph
1852 // and figure out how to break any cycles.
1853 //
1854
1855 // Record esp-based slot for receiver on stack for non-static methods
1856 int receiver_offset = -1;
1857
1858 // This is a trick. We double the stack slots so we can claim
1859 // the oops in the caller's frame. Since we are sure to have
1860 // more args than the caller, doubling is enough to make
1861 // sure we can capture all the incoming oop args from the
1862 // caller.
1863 //
1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1865
1866 // Mark location of rbp (someday)
1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1868
1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1870 // All inbound args are referenced based on rbp and all outbound args via rsp.
1871
1872
1873 #ifdef ASSERT
1874 bool reg_destroyed[RegisterImpl::number_of_registers];
1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1877 reg_destroyed[r] = false;
1878 }
1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1880 freg_destroyed[f] = false;
1881 }
1882
1883 #endif /* ASSERT */
1884
1885 // This may iterate in two different directions depending on the
1886 // kind of native it is. The reason is that for regular JNI natives
1887 // the incoming and outgoing registers are offset upwards and for
1888 // critical natives they are offset down.
1889 GrowableArray<int> arg_order(2 * total_in_args);
1890
1891 VMRegPair tmp_vmreg;
1892 tmp_vmreg.set2(rbx->as_VMReg());
1893
1894 if (!is_critical_native) {
1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1896 arg_order.push(i);
1897 arg_order.push(c_arg);
1898 }
1899 } else {
1900 // Compute a valid move order, using tmp_vmreg to break any cycles
1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1902 }
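// Illustrative contents of arg_order for a regular native with two Java args
// and three C args: the pairs (1, 2) and (0, 1). The hidden JNIEnv* slot
// (c_arg 0) is not shuffled here; it is loaded into c_rarg0 just before the call.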
1903
1904 int temploc = -1;
1905 for (int ai = 0; ai < arg_order.length(); ai += 2) {
1906 int i = arg_order.at(ai);
1907 int c_arg = arg_order.at(ai + 1);
1908 __ block_comment(err_msg("move %d -> %d", i, c_arg));
1909 if (c_arg == -1) {
1910 assert(is_critical_native, "should only be required for critical natives");
1911 // This arg needs to be moved to a temporary
1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1913 in_regs[i] = tmp_vmreg;
1914 temploc = i;
1915 continue;
1916 } else if (i == -1) {
1917 assert(is_critical_native, "should only be required for critical natives");
1918 // Read from the temporary location
1919 assert(temploc != -1, "must be valid");
1920 i = temploc;
1921 temploc = -1;
1922 }
1923 #ifdef ASSERT
1924 if (in_regs[i].first()->is_Register()) {
1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1926 } else if (in_regs[i].first()->is_XMMRegister()) {
1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1928 }
1929 if (out_regs[c_arg].first()->is_Register()) {
1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1933 }
1934 #endif /* ASSERT */
1935 switch (in_sig_bt[i]) {
1936 case T_ARRAY:
1937 if (is_critical_native) {
1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1939 c_arg++;
1940 #ifdef ASSERT
1941 if (out_regs[c_arg].first()->is_Register()) {
1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1945 }
1946 #endif
1947 break;
1948 }
1949 case T_OBJECT:
1950 assert(!is_critical_native, "no oop arguments");
1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1952 ((i == 0) && (!is_static)),
1953 &receiver_offset);
1954 break;
1955 case T_VOID:
1956 break;
1957
1958 case T_FLOAT:
1959 __ float_move(in_regs[i], out_regs[c_arg]);
1960 break;
1961
1962 case T_DOUBLE:
1963 assert( i + 1 < total_in_args &&
1964 in_sig_bt[i + 1] == T_VOID &&
1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1966 __ double_move(in_regs[i], out_regs[c_arg]);
1967 break;
1968
1969 case T_LONG :
1970 __ long_move(in_regs[i], out_regs[c_arg]);
1971 break;
1972
1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1974
1975 default:
1976 __ move32_64(in_regs[i], out_regs[c_arg]);
1977 }
1978 }
1979
1980 int c_arg;
1981
1982 // Pre-load a static method's oop into r14. Used both by locking code and
1983 // the normal JNI call code.
1984 if (!is_critical_native) {
1985 // point c_arg at the first arg that is already loaded in case we
1986 // need to spill before we call out
1987 c_arg = total_c_args - total_in_args;
1988
1989 if (method->is_static()) {
1990
1991 // load oop into a register
1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1993
1994 // Now handlize the static class mirror; it's known not-null.
1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1997
1998 // Now get the handle
1999 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2000 // store the klass handle as second argument
2001 __ movptr(c_rarg1, oop_handle_reg);
2002 // and protect the arg if we must spill
2003 c_arg--;
2004 }
2005 } else {
2006 // For JNI critical methods we need to save all registers in save_args.
2007 c_arg = 0;
2008 }
2009
2010 // Change state to native (we save the return address in the thread, since it might not
2011 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2012 // points into the right code segment. It does not have to be the correct return pc.
2013 // We use the same pc/oopMap repeatedly when we call out
2014
2015 intptr_t the_pc = (intptr_t) __ pc();
2016 oop_maps->add_gc_map(the_pc - start, map);
2017
2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2019
2020
2021 // We have all of the arguments set up at this point. We must not touch any of the
2022 // argument registers from here on (if we had to save/restore them, there would be no oopMap for them).
2023
2024 {
2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2026 // protect the args we've loaded
2027 save_args(masm, total_c_args, c_arg, out_regs);
2028 __ mov_metadata(c_rarg1, method());
2029 __ call_VM_leaf(
2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2031 r15_thread, c_rarg1);
2032 restore_args(masm, total_c_args, c_arg, out_regs);
2033 }
2034
2035 // RedefineClasses() tracing support for obsolete method entry
2036 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2037 // protect the args we've loaded
2038 save_args(masm, total_c_args, c_arg, out_regs);
2039 __ mov_metadata(c_rarg1, method());
2040 __ call_VM_leaf(
2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2042 r15_thread, c_rarg1);
2043 restore_args(masm, total_c_args, c_arg, out_regs);
2044 }
2045
2046 // Lock a synchronized method
2047
2048 // Register definitions used by locking and unlocking
2049
2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2051 const Register obj_reg = rbx; // Will contain the oop
2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2053 const Register old_hdr = r13; // value of old header at unlock time
2054
2055 Label slow_path_lock;
2056 Label lock_done;
2057
2058 if (method->is_synchronized()) {
2059 assert(!is_critical_native, "unhandled");
2060
2061
2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2063
2064 // Get the handle (the 2nd argument)
2065 __ mov(oop_handle_reg, c_rarg1);
2066
2067 // Get address of the box
2068
2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2070
2071 // Load the oop from the handle
2072 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2073
2074 if (UseBiasedLocking) {
2075 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2076 }
2077
2078 // Load immediate 1 into swap_reg %rax
2079 __ movl(swap_reg, 1);
2080
2081 // Load (object->mark() | 1) into swap_reg %rax
2082 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2083
2084 // Save (object->mark() | 1) into BasicLock's displaced header
2085 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2086
2087 // src -> dest iff dest == rax else rax <- dest
2088 __ lock();
2089 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2090 __ jcc(Assembler::equal, lock_done);
2091
2092 // Hmm should this move to the slow path code area???
2093
2094 // Test if the oopMark is an obvious stack pointer, i.e.,
2095 // 1) (mark & 3) == 0, and
2096 // 2) rsp <= mark < mark + os::pagesize()
2097 // These 3 tests can be done by evaluating the following
2098 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2099 // assuming both stack pointer and pagesize have their
2100 // least significant 2 bits clear.
2101 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
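// Illustrative arithmetic with a 4K page: 3 - 4096 == 0xfffffffffffff003, so
// the mask keeps the two low bits plus every bit at or above the page size.
// The result is zero exactly when the mark is 4-byte aligned and lies less
// than one page above rsp, i.e. the existing lock lives on our own stack
// (the recursive case).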
2102
2103 __ subptr(swap_reg, rsp);
2104 __ andptr(swap_reg, 3 - os::vm_page_size());
2105
2106 // Save the test result; for the recursive case, the result is zero
2107 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2108 __ jcc(Assembler::notEqual, slow_path_lock);
2109
2110 // Slow path will re-enter here
2111
2112 __ bind(lock_done);
2113 }
2114
2115 // Finally just about ready to make the JNI call
2116
2117 // get JNIEnv* which is first argument to native
2118 if (!is_critical_native) {
2119 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2120
2121 // Now set thread in native
2122 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2123 }
2124
2125 __ call(RuntimeAddress(native_func));
2126
2127 // Verify or restore cpu control state after JNI call
2128 __ restore_cpu_control_state_after_jni();
2129
2130 // Unpack native results.
2131 switch (ret_type) {
2132 case T_BOOLEAN: __ c2bool(rax); break;
2133 case T_CHAR : __ movzwl(rax, rax); break;
2134 case T_BYTE : __ sign_extend_byte (rax); break;
2135 case T_SHORT : __ sign_extend_short(rax); break;
2136 case T_INT : /* nothing to do */ break;
2137 case T_DOUBLE :
2138 case T_FLOAT :
2139 // Result is in xmm0; we'll save as needed
2140 break;
2141 case T_ARRAY: // Really a handle
2142 case T_OBJECT: // Really a handle
2143 break; // can't de-handlize until after safepoint check
2144 case T_VOID: break;
2145 case T_LONG: break;
2146 default : ShouldNotReachHere();
2147 }
2148
2149 Label after_transition;
2150
2151 // If this is a critical native, check for a safepoint or suspend request after the call.
2152 // If a safepoint is needed, transition to native, then to native_trans to handle
2153 // safepoints like the native methods that are not critical natives.
2154 if (is_critical_native) {
2155 Label needs_safepoint;
2156 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2157 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2158 __ jcc(Assembler::equal, after_transition);
2159 __ bind(needs_safepoint);
2160 }
2161
2162 // Switch thread to "native transition" state before reading the synchronization state.
2163 // This additional state is necessary because reading and testing the synchronization
2164 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2165 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2166 // VM thread changes sync state to synchronizing and suspends threads for GC.
2167 // Thread A is resumed to finish this native method, but doesn't block here since it
2168 // didn't see any synchronization in progress, and escapes.
2169 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2170
2171 // Force this write out before the read below
2172 __ membar(Assembler::Membar_mask_bits(
2173 Assembler::LoadLoad | Assembler::LoadStore |
2174 Assembler::StoreLoad | Assembler::StoreStore));
2175
2176 // check for safepoint operation in progress and/or pending suspend requests
2177 {
2178 Label Continue;
2179 Label slow_path;
2180
2181 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2182
2183 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2184 __ jcc(Assembler::equal, Continue);
2185 __ bind(slow_path);
2186
2187 // Don't use call_VM as it will see a possible pending exception and forward it
2188 // and never return here preventing us from clearing _last_native_pc down below.
2189 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2190 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2191 // by hand.
2192 //
2193 __ vzeroupper();
2194 save_native_result(masm, ret_type, stack_slots);
2195 __ mov(c_rarg0, r15_thread);
2196 __ mov(r12, rsp); // remember sp
2197 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2198 __ andptr(rsp, -16); // align stack as required by ABI
2199 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2200 __ mov(rsp, r12); // restore sp
2201 __ reinit_heapbase();
2202 // Restore any method result value
2203 restore_native_result(masm, ret_type, stack_slots);
2204 __ bind(Continue);
2205 }
2206
2207 // change thread state
2208 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2209 __ bind(after_transition);
2210
2211 Label reguard;
2212 Label reguard_done;
2213 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2214 __ jcc(Assembler::equal, reguard);
2215 __ bind(reguard_done);
2216
2217 // native result if any is live
2218
2219 // Unlock
2220 Label unlock_done;
2221 Label slow_path_unlock;
2222 if (method->is_synchronized()) {
2223
2224 // Get locked oop from the handle we passed to jni
2225 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2226
2227 Label done;
2228
2229 if (UseBiasedLocking) {
2230 __ biased_locking_exit(obj_reg, old_hdr, done);
2231 }
2232
2233 // Simple recursive lock?
2234
2235 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2236 __ jcc(Assembler::equal, done);
2237
2238 // Must save rax if it is live now because cmpxchg must use it
2239 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2240 save_native_result(masm, ret_type, stack_slots);
2241 }
2242
2243
2244 // get address of the stack lock
2245 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2246 // get old displaced header
2247 __ movptr(old_hdr, Address(rax, 0));
2248
2249 // Atomic swap old header if oop still contains the stack lock
2250 __ lock();
2251 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2252 __ jcc(Assembler::notEqual, slow_path_unlock);
2253
2254 // slow path re-enters here
2255 __ bind(unlock_done);
2256 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2257 restore_native_result(masm, ret_type, stack_slots);
2258 }
2259
2260 __ bind(done);
2261
2262 }
2263 {
2264 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2265 save_native_result(masm, ret_type, stack_slots);
2266 __ mov_metadata(c_rarg1, method());
2267 __ call_VM_leaf(
2268 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2269 r15_thread, c_rarg1);
2270 restore_native_result(masm, ret_type, stack_slots);
2271 }
2272
2273 __ reset_last_Java_frame(false);
2274
2275 // Unbox oop result, e.g. JNIHandles::resolve value.
2276 if (is_reference_type(ret_type)) {
2277 __ resolve_jobject(rax /* value */,
2278 r15_thread /* thread */,
2279 rcx /* tmp */);
2280 }
2281
2282 if (CheckJNICalls) {
2283 // clear_pending_jni_exception_check
2284 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2285 }
2286
2287 if (!is_critical_native) {
2288 // reset handle block
2289 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2290 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2291 }
2292
2293 // pop our frame
2294
2295 __ leave();
2296
2297 if (!is_critical_native) {
2298 // Any exception pending?
2299 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2300 __ jcc(Assembler::notEqual, exception_pending);
2301 }
2302
2303 // Return
2304
2305 __ ret(0);
2306
2307 // Unexpected paths are out of line and go here
2308
2309 if (!is_critical_native) {
2310 // An exception is pending
2311 __ bind(exception_pending);
2312
2313 // forward the exception to the caller
2314 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2315 }
2316
2317 // Slow path locking & unlocking
2318 if (method->is_synchronized()) {
2319
2320 // BEGIN Slow path lock
2321 __ bind(slow_path_lock);
2322
2323 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2324 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2325
2326 // protect the args we've loaded
2327 save_args(masm, total_c_args, c_arg, out_regs);
2328
2329 __ mov(c_rarg0, obj_reg);
2330 __ mov(c_rarg1, lock_reg);
2331 __ mov(c_rarg2, r15_thread);
2332
2333 // Not a leaf but we have last_Java_frame setup as we want
2334 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2335 restore_args(masm, total_c_args, c_arg, out_regs);
2336
2337 #ifdef ASSERT
2338 { Label L;
2339 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2340 __ jcc(Assembler::equal, L);
2341 __ stop("no pending exception allowed on exit from monitorenter");
2342 __ bind(L);
2343 }
2344 #endif
2345 __ jmp(lock_done);
2346
2347 // END Slow path lock
2348
2349 // BEGIN Slow path unlock
2350 __ bind(slow_path_unlock);
2351
2352 // If we haven't already saved the native result we must save it now as xmm registers
2353 // are still exposed.
2354 __ vzeroupper();
2355 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2356 save_native_result(masm, ret_type, stack_slots);
2357 }
2358
2359 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2360
2361 __ mov(c_rarg0, obj_reg);
2362 __ mov(c_rarg2, r15_thread);
2363 __ mov(r12, rsp); // remember sp
2364 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2365 __ andptr(rsp, -16); // align stack as required by ABI
2366
2367 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2368 // NOTE that obj_reg == rbx currently
2369 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2370 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2371
2372 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2373 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2374 __ mov(rsp, r12); // restore sp
2375 __ reinit_heapbase();
2376 #ifdef ASSERT
2377 {
2378 Label L;
2379 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2380 __ jcc(Assembler::equal, L);
2381 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2382 __ bind(L);
2383 }
2384 #endif /* ASSERT */
2385
2386 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2387
2388 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2389 restore_native_result(masm, ret_type, stack_slots);
2390 }
2391 __ jmp(unlock_done);
2392
2393 // END Slow path unlock
2394
2395 } // synchronized
2396
2397 // SLOW PATH Reguard the stack if needed
2398
2399 __ bind(reguard);
2400 __ vzeroupper();
2401 save_native_result(masm, ret_type, stack_slots);
2402 __ mov(r12, rsp); // remember sp
2403 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2404 __ andptr(rsp, -16); // align stack as required by ABI
2405 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2406 __ mov(rsp, r12); // restore sp
2407 __ reinit_heapbase();
2408 restore_native_result(masm, ret_type, stack_slots);
2409 // and continue
2410 __ jmp(reguard_done);
2411
2412
2413
2414 __ flush();
2415
2416 nmethod *nm = nmethod::new_native_nmethod(method,
2417 compile_id,
2418 masm->code(),
2419 vep_offset,
2420 frame_complete,
2421 stack_slots / VMRegImpl::slots_per_word,
2422 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2423 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2424 oop_maps);
2425
2426 return nm;
2427 }
2428
2429 // This function returns the adjustment (in number of words) to a c2i adapter
2430 // activation, for use during deoptimization.
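// For example (illustrative), a callee with two more locals than parameters
// requires the caller's frame to be extended by 2 * Interpreter::stackElementWords words.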
2431 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2432 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2433 }
2434
2435
2436 uint SharedRuntime::out_preserve_stack_slots() {
2437 return 0;
2438 }
2439
2440
2441 // Number of stack slots between incoming argument block and the start of
2442 // a new frame. The PROLOG must add this many slots to the stack. The
2443 // EPILOG must remove this many slots. amd64 needs two slots for
2444 // return address.
2445 uint SharedRuntime::in_preserve_stack_slots() {
2446 return 4 + 2 * VerifyStackAtCalls;
2447 }
2448
2449 //------------------------------generate_deopt_blob----------------------------
2450 void SharedRuntime::generate_deopt_blob() {
2451 // Allocate space for the code
2452 ResourceMark rm;
2453 // Setup code generation tools
2454 int pad = 0;
2455 if (UseAVX > 2) {
2456 pad += 1024;
2457 }
2458 #if INCLUDE_JVMCI
2459 if (EnableJVMCI) {
2460 pad += 512; // Increase the buffer size when compiling for JVMCI
2461 }
2462 #endif
2463 CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2464 MacroAssembler* masm = new MacroAssembler(&buffer);
2465 int frame_size_in_words;
2466 OopMap* map = NULL;
2467 OopMapSet *oop_maps = new OopMapSet();
2468
2469 // -------------
2470 // This code enters when returning to a de-optimized nmethod. A return
2471 // address has been pushed on the stack, and return values are in
2472 // registers.
2473 // If we are doing a normal deopt then we were called from the patched
2474 // nmethod from the point we returned to the nmethod. So the return
2475 // address on the stack is wrong by NativeCall::instruction_size
2476 // We will adjust the value so it looks like we have the original return
2477 // address on the stack (like when we eagerly deoptimized).
2478 // In the case of an exception pending when deoptimizing, we enter
2479 // with a return address on the stack that points after the call we patched
2480 // into the exception handler. We have the following register state from,
2481 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2482 // rax: exception oop
2483 // rbx: exception handler
2484 // rdx: throwing pc
2485 // So in this case we simply jam rdx into the useless return address and
2486 // the stack looks just like we want.
2487 //
2488 // At this point we need to de-opt. We save the argument return
2489 // registers. We call the first C routine, fetch_unroll_info(). This
2490 // routine captures the return values and returns a structure which
2491 // describes the current frame size and the sizes of all replacement frames.
2492 // The current frame is compiled code and may contain many inlined
2493 // functions, each with their own JVM state. We pop the current frame, then
2494 // push all the new frames. Then we call the C routine unpack_frames() to
2495 // populate these frames. Finally unpack_frames() returns us the new target
2496 // address. Notice that callee-save registers are BLOWN here; they have
2497 // already been captured in the vframeArray at the time the return PC was
2498 // patched.
2499 address start = __ pc();
2500 Label cont;
2501
2502 // Prolog for non exception case!
2503
2504 // Save everything in sight.
2505 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2506
2507 // Normal deoptimization. Save exec mode for unpack_frames.
2508 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2509 __ jmp(cont);
2510
2511 int reexecute_offset = __ pc() - start;
2512 #if INCLUDE_JVMCI && !defined(COMPILER1)
2513 if (EnableJVMCI && UseJVMCICompiler) {
2514 // JVMCI does not use this kind of deoptimization
2515 __ should_not_reach_here();
2516 }
2517 #endif
2518
2519 // Reexecute case
2520 // The return address is the pc that describes which bci to re-execute at
2521
2522 // No need to update map as each call to save_live_registers will produce identical oopmap
2523 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2524
2525 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2526 __ jmp(cont);
2527
2528 #if INCLUDE_JVMCI
2529 Label after_fetch_unroll_info_call;
2530 int implicit_exception_uncommon_trap_offset = 0;
2531 int uncommon_trap_offset = 0;
2532
2533 if (EnableJVMCI) {
2534 implicit_exception_uncommon_trap_offset = __ pc() - start;
2535
2536 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2537 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2538
2539 uncommon_trap_offset = __ pc() - start;
2540
2541 // Save everything in sight.
2542 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2543 // fetch_unroll_info needs to call last_java_frame()
2544 __ set_last_Java_frame(noreg, noreg, NULL);
2545
2546 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2547 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2548
2549 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2550 __ mov(c_rarg0, r15_thread);
2551 __ movl(c_rarg2, r14); // exec mode
2552 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2553 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2554
2555 __ reset_last_Java_frame(false);
2556
2557 __ jmp(after_fetch_unroll_info_call);
2558 } // EnableJVMCI
2559 #endif // INCLUDE_JVMCI
2560
2561 int exception_offset = __ pc() - start;
2562
2563 // Prolog for exception case
2564
2565 // all registers are dead at this entry point, except for rax, and
2566 // rdx which contain the exception oop and exception pc
2567 // respectively. Set them in TLS and fall thru to the
2568 // unpack_with_exception_in_tls entry point.
2569
2570 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2571 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2572
2573 int exception_in_tls_offset = __ pc() - start;
2574
2575 // new implementation because exception oop is now passed in JavaThread
2576
2577 // Prolog for exception case
2578 // All registers must be preserved because they might be used by LinearScan
2579 // Exception oop and throwing PC are passed in JavaThread
2580 // tos: stack at point of call to method that threw the exception (i.e. only
2581 // args are on the stack, no return address)
2582
2583 // make room on stack for the return address
2584 // It will be patched later with the throwing pc. The correct value is not
2585 // available now because loading it from memory would destroy registers.
2586 __ push(0);
2587
2588 // Save everything in sight.
2589 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2590
2591 // Now it is safe to overwrite any register
2592
2593 // Deopt during an exception. Save exec mode for unpack_frames.
2594 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2595
2596 // load throwing pc from JavaThread and patch it as the return address
2597 // of the current frame. Then clear the field in JavaThread
2598
2599 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2600 __ movptr(Address(rbp, wordSize), rdx);
2601 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2602
2603 #ifdef ASSERT
2604 // verify that there is really an exception oop in JavaThread
2605 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2606 __ verify_oop(rax);
2607
2608 // verify that there is no pending exception
2609 Label no_pending_exception;
2610 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2611 __ testptr(rax, rax);
2612 __ jcc(Assembler::zero, no_pending_exception);
2613 __ stop("must not have pending exception here");
2614 __ bind(no_pending_exception);
2615 #endif
2616
2617 __ bind(cont);
2618
2619 // Call C code. Need thread and this frame, but NOT official VM entry
2620 // crud. We cannot block on this call, no GC can happen.
2621 //
2622 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2623
2624 // fetch_unroll_info needs to call last_java_frame().
2625
2626 __ set_last_Java_frame(noreg, noreg, NULL);
2627 #ifdef ASSERT
2628 { Label L;
2629 __ cmpptr(Address(r15_thread,
2630 JavaThread::last_Java_fp_offset()),
2631 (int32_t)0);
2632 __ jcc(Assembler::equal, L);
2633 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2634 __ bind(L);
2635 }
2636 #endif // ASSERT
2637 __ mov(c_rarg0, r15_thread);
2638 __ movl(c_rarg1, r14); // exec_mode
2639 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2640
2641 // Need to have an oopmap that tells fetch_unroll_info where to
2642 // find any register it might need.
2643 oop_maps->add_gc_map(__ pc() - start, map);
2644
2645 __ reset_last_Java_frame(false);
2646
2647 #if INCLUDE_JVMCI
2648 if (EnableJVMCI) {
2649 __ bind(after_fetch_unroll_info_call);
2650 }
2651 #endif
2652
2653 // Load UnrollBlock* into rdi
2654 __ mov(rdi, rax);
2655
2656 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2657 Label noException;
2658 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2659 __ jcc(Assembler::notEqual, noException);
2660 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2661 // QQQ this is useless it was NULL above
2662 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2663 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2664 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2665
2666 __ verify_oop(rax);
2667
2668 // Overwrite the result registers with the exception results.
2669 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2670 // I think this is useless
2671 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2672
2673 __ bind(noException);
2674
2675 // Only register save data is on the stack.
2676 // Now restore the result registers. Everything else is either dead
2677 // or captured in the vframeArray.
2678 RegisterSaver::restore_result_registers(masm);
2679
2680 // All of the register save area has been popped off the stack. Only the
2681 // return address remains.
2682
2683 // Pop all the frames we must move/replace.
2684 //
2685 // Frame picture (youngest to oldest)
2686 // 1: self-frame (no frame link)
2687 // 2: deopting frame (no frame link)
2688 // 3: caller of deopting frame (could be compiled/interpreted).
2689 //
2690 // Note: by leaving the return address of self-frame on the stack
2691 // and using the size of frame 2 to adjust the stack
2692 // when we are done the return to frame 3 will still be on the stack.
2693
2694 // Pop deoptimized frame
2695 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2696 __ addptr(rsp, rcx);
2697
2698 // rsp should be pointing at the return address to the caller (3)
2699
2700 // Pick up the initial fp we should save
2701 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2702 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2703
2704 #ifdef ASSERT
2705 // Compilers generate code that bangs the stack by as much as the
2706 // interpreter would need. So this stack banging should never
2707 // trigger a fault. Verify that it does not on non product builds.
2708 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2709 __ bang_stack_size(rbx, rcx);
2710 #endif
2711
2712 // Load address of array of frame pcs into rcx
2713 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2714
2715 // Trash the old pc
2716 __ addptr(rsp, wordSize);
2717
2718 // Load address of array of frame sizes into rsi
2719 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2720
2721 // Load counter into rdx
2722 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2723
2724 // Now adjust the caller's stack to make up for the extra locals
2725 // but record the original sp so that we can save it in the skeletal interpreter
2726 // frame and the stack walking of interpreter_sender will get the unextended sp
2727 // value and not the "real" sp value.
2728
2729 const Register sender_sp = r8;
2730
2731 __ mov(sender_sp, rsp);
2732 __ movl(rbx, Address(rdi,
2733 Deoptimization::UnrollBlock::
2734 caller_adjustment_offset_in_bytes()));
2735 __ subptr(rsp, rbx);
2736
2737 // Push interpreter frames in a loop
2738 Label loop;
2739 __ bind(loop);
2740 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2741 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2742 __ pushptr(Address(rcx, 0)); // Save return address
2743 __ enter(); // Save old & set new ebp
2744 __ subptr(rsp, rbx); // Prolog
2745 // This value is corrected by layout_activation_impl
2746 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2747 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2748 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2749 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2750 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2751 __ decrementl(rdx); // Decrement counter
2752 __ jcc(Assembler::notZero, loop);
2753 __ pushptr(Address(rcx, 0)); // Save final return address
2754
2755 // Re-push self-frame
2756 __ enter(); // Save old & set new ebp
2757
2758 // Allocate a full sized register save area.
2759 // Return address and rbp are in place, so we allocate two less words.
2760 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2761
2762 // Restore frame locals after moving the frame
2763 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2764 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2765
2766 // Call C code. Need thread but NOT official VM entry
2767 // crud. We cannot block on this call, no GC can happen. Call should
2768 // restore return values to their stack-slots with the new SP.
2769 //
2770 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2771
2772 // Use rbp because the frames look interpreted now
2773 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2774 // Don't need the precise return PC here, just precise enough to point into this code blob.
2775 address the_pc = __ pc();
2776 __ set_last_Java_frame(noreg, rbp, the_pc);
2777
2778 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2779 __ mov(c_rarg0, r15_thread);
2780 __ movl(c_rarg1, r14); // second arg: exec_mode
2781 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2782 // Revert SP alignment after call since we're going to do some SP relative addressing below
2783 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2784
2785 // Set an oopmap for the call site
2786 // Use the same PC we used for the last java frame
2787 oop_maps->add_gc_map(the_pc - start,
2788 new OopMap( frame_size_in_words, 0 ));
2789
2790 // Clear fp AND pc
2791 __ reset_last_Java_frame(true);
2792
2793 // Collect return values
2794 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2795 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2796 // I think this is useless (throwing pc?)
2797 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2798
2799 // Pop self-frame.
2800 __ leave(); // Epilog
2801
2802 // Jump to interpreter
2803 __ ret(0);
2804
2805 // Make sure all code is generated
2806 masm->flush();
2807
2808 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2809 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2810 #if INCLUDE_JVMCI
2811 if (EnableJVMCI) {
2812 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2813 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2814 }
2815 #endif
2816 }
2817
2818 #ifdef COMPILER2
2819 //------------------------------generate_uncommon_trap_blob--------------------
2820 void SharedRuntime::generate_uncommon_trap_blob() {
2821 // Allocate space for the code
2822 ResourceMark rm;
2823 // Setup code generation tools
2824 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2825 MacroAssembler* masm = new MacroAssembler(&buffer);
2826
2827 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2828
2829 address start = __ pc();
2830
2831 if (UseRTMLocking) {
2832 // Abort RTM transaction before possible nmethod deoptimization.
2833 __ xabort(0);
2834 }
2835
2836 // Push self-frame. We get here with a return address on the
2837 // stack, so rsp is 8-byte aligned until we allocate our frame.
2838 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2839
2840 // No callee saved registers. rbp is assumed implicitly saved
2841 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2842
2843 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2844 // runtime expects it.
2845 __ movl(c_rarg1, j_rarg0);
2846
2847 __ set_last_Java_frame(noreg, noreg, NULL);
2848
2849 // Call C code. Need thread but NOT official VM entry
2850 // crud. We cannot block on this call, no GC can happen. Call should
2851 // capture callee-saved registers as well as return values.
2852 // Thread is in rdi already.
2853 //
2854 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2855
2856 __ mov(c_rarg0, r15_thread);
2857 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2858 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2859
2860 // Set an oopmap for the call site
2861 OopMapSet* oop_maps = new OopMapSet();
2862 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2863
2864 // location of rbp is known implicitly by the frame sender code
2865
2866 oop_maps->add_gc_map(__ pc() - start, map);
2867
2868 __ reset_last_Java_frame(false);
2869
2870 // Load UnrollBlock* into rdi
2871 __ mov(rdi, rax);
2872
2873 #ifdef ASSERT
2874 { Label L;
2875 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2876 (int32_t)Deoptimization::Unpack_uncommon_trap);
2877 __ jcc(Assembler::equal, L);
2878 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2879 __ bind(L);
2880 }
2881 #endif
2882
2883 // Pop all the frames we must move/replace.
2884 //
2885 // Frame picture (youngest to oldest)
2886 // 1: self-frame (no frame link)
2887 // 2: deopting frame (no frame link)
2888 // 3: caller of deopting frame (could be compiled/interpreted).
2889
2890 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2891 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2892
2893 // Pop deoptimized frame (int)
2894 __ movl(rcx, Address(rdi,
2895 Deoptimization::UnrollBlock::
2896 size_of_deoptimized_frame_offset_in_bytes()));
2897 __ addptr(rsp, rcx);
2898
2899 // rsp should be pointing at the return address to the caller (3)
2900
2901 // Pick up the initial fp we should save
2902 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2903 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2904
2905 #ifdef ASSERT
2906 // Compilers generate code that bangs the stack by as much as the
2907 // interpreter would need. So this stack banging should never
2908 // trigger a fault. Verify that it does not on non product builds.
2909 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2910 __ bang_stack_size(rbx, rcx);
2911 #endif
2912
2913 // Load address of array of frame pcs into rcx (address*)
2914 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2915
2916 // Trash the return pc
2917 __ addptr(rsp, wordSize);
2918
2919 // Load address of array of frame sizes into rsi (intptr_t*)
2920 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes()));
2921
2922 // Counter
2923 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int)
2924
2925 // Now adjust the caller's stack to make up for the extra locals but
2926 // record the original sp so that we can save it in the skeletal
2927 // interpreter frame and the stack walking of interpreter_sender
2928 // will get the unextended sp value and not the "real" sp value.
2929
2930 const Register sender_sp = r8;
2931
2932 __ mov(sender_sp, rsp);
2933 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int)
2934 __ subptr(rsp, rbx);
2935
2936 // Push interpreter frames in a loop
2937 Label loop;
2938 __ bind(loop);
2939 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2940 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
2941 __ pushptr(Address(rcx, 0)); // Save return address
2942 __ enter(); // Save old & set new rbp
2943 __ subptr(rsp, rbx); // Prolog
2944 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2945 sender_sp); // Make it walkable
2946 // This value is corrected by layout_activation_impl
2947 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2948 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2949 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2950 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2951 __ decrementl(rdx); // Decrement counter
2952 __ jcc(Assembler::notZero, loop);
2953 __ pushptr(Address(rcx, 0)); // Save final return address
2954
2955 // Re-push self-frame
2956 __ enter(); // Save old & set new rbp
2957 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2958 // Prolog
2959
2960 // Use rbp because the frames look interpreted now
2961 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2962 // Don't need the precise return PC here, just precise enough to point into this code blob.
2963 address the_pc = __ pc();
2964 __ set_last_Java_frame(noreg, rbp, the_pc);
2965
2966 // Call C code. Need thread but NOT official VM entry
2967 // crud. We cannot block on this call, no GC can happen. Call should
2968 // restore return values to their stack-slots with the new SP.
2969 // Thread is in rdi already.
2970 //
2971 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2972
2973 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2974 __ mov(c_rarg0, r15_thread);
2975 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2976 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2977
2978 // Set an oopmap for the call site
2979 // Use the same PC we used for the last java frame
2980 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2981
2982 // Clear fp AND pc
2983 __ reset_last_Java_frame(true);
2984
2985 // Pop self-frame.
2986 __ leave(); // Epilog
2987
2988 // Jump to interpreter
2989 __ ret(0);
2990
2991 // Make sure all code is generated
2992 masm->flush();
2993
2994 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
2995 SimpleRuntimeFrame::framesize >> 1);
2996 }
2997 #endif // COMPILER2
2998
2999 //------------------------------generate_handler_blob------
3000 //
// Generate a special Compile2Runtime blob that saves all registers,
// and sets up an oopmap.
3003 //
3004 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3005 assert(StubRoutines::forward_exception_entry() != NULL,
3006 "must be generated before");
3007
3008 ResourceMark rm;
3009 OopMapSet *oop_maps = new OopMapSet();
3010 OopMap* map;
3011
3012 // Allocate space for the code. Setup code generation tools.
3013 CodeBuffer buffer("handler_blob", 2048, 1024);
3014 MacroAssembler* masm = new MacroAssembler(&buffer);
3015
3016 address start = __ pc();
3017 address call_pc = NULL;
3018 int frame_size_in_words;
3019 bool cause_return = (poll_type == POLL_AT_RETURN);
3020 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3021
3022 if (UseRTMLocking) {
3023 // Abort RTM transaction before calling runtime
3024 // because critical section will be large and will be
3025 // aborted anyway. Also nmethod could be deoptimized.
3026 __ xabort(0);
3027 }
3028
3029 // Make room for return address (or push it again)
3030 if (!cause_return) {
3031 __ push(rbx);
3032 }
3033
3034 // Save registers, fpu state, and flags
3035 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3036
  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.
3040
3041 __ set_last_Java_frame(noreg, noreg, NULL);
3042
  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.
3045
3046 if (!cause_return) {
3047 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register, so we can look at it later to determine
    // if someone changed the return address for us!
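    // (Address(rbp, wordSize) is this frame's return-address slot; it currently
    // holds the placeholder rbx pushed above and is overwritten here with the
    // pc at the poll.)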
3050 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3051 __ movptr(Address(rbp, wordSize), rbx);
3052 }
3053
3054 // Do the call
3055 __ mov(c_rarg0, r15_thread);
3056 __ call(RuntimeAddress(call_ptr));
3057
3058 // Set an oopmap for the call site. This oopmap will map all
3059 // oop-registers and debug-info registers as callee-saved. This
3060 // will allow deoptimization at this safepoint to find all possible
3061 // debug-info recordings, as well as let GC find all oops.
3062
3063 oop_maps->add_gc_map( __ pc() - start, map);
3064
3065 Label noException;
3066
3067 __ reset_last_Java_frame(false);
3068
3069 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3070 __ jcc(Assembler::equal, noException);
3071
3072 // Exception pending
3073
3074 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3075
3076 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3077
3078 // No exception case
3079 __ bind(noException);
3080
3081 Label no_adjust;
3082 #ifdef ASSERT
3083 Label bail;
3084 #endif
3085 if (!cause_return) {
3086 Label no_prefix, not_special;
3087
3088 // If our stashed return pc was modified by the runtime we avoid touching it
3089 __ cmpptr(rbx, Address(rbp, wordSize));
3090 __ jccb(Assembler::notEqual, no_adjust);
3091
3092 // Skip over the poll instruction.
3093 // See NativeInstruction::is_safepoint_poll()
3094 // Possible encodings:
3095 // 85 00 test %eax,(%rax)
3096 // 85 01 test %eax,(%rcx)
3097 // 85 02 test %eax,(%rdx)
3098 // 85 03 test %eax,(%rbx)
3099 // 85 06 test %eax,(%rsi)
3100 // 85 07 test %eax,(%rdi)
3101 //
3102 // 41 85 00 test %eax,(%r8)
3103 // 41 85 01 test %eax,(%r9)
3104 // 41 85 02 test %eax,(%r10)
3105 // 41 85 03 test %eax,(%r11)
3106 // 41 85 06 test %eax,(%r14)
3107 // 41 85 07 test %eax,(%r15)
3108 //
3109 // 85 04 24 test %eax,(%rsp)
3110 // 41 85 04 24 test %eax,(%r12)
3111 // 85 45 00 test %eax,0x0(%rbp)
3112 // 41 85 45 00 test %eax,0x0(%r13)
3113
3114 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3115 __ jcc(Assembler::notEqual, no_prefix);
3116 __ addptr(rbx, 1);
3117 __ bind(no_prefix);
3118 #ifdef ASSERT
3119 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3120 #endif
3121 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3122 // r12/rsp 0x04
3123 // r13/rbp 0x05
3124 __ movzbq(rcx, Address(rbx, 1));
3125 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3126 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3127 __ cmpptr(rcx, 1);
3128 __ jcc(Assembler::above, not_special);
3129 __ addptr(rbx, 1);
3130 __ bind(not_special);
3131 #ifdef ASSERT
3132 // Verify the correct encoding of the poll we're about to skip.
3133 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3134 __ jcc(Assembler::notEqual, bail);
3135 // Mask out the modrm bits
3136 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3137 // rax encodes to 0, so if the bits are nonzero it's incorrect
3138 __ jcc(Assembler::notZero, bail);
3139 #endif
3140 // Adjust return pc forward to step over the safepoint poll instruction
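    // The remaining opcode + ModRM pair is always 2 bytes; any REX prefix or
    // extra SIB/disp8 byte has already been stepped over above.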
3141 __ addptr(rbx, 2);
3142 __ movptr(Address(rbp, wordSize), rbx);
3143 }
3144
3145 __ bind(no_adjust);
3146 // Normal exit, restore registers and exit.
3147 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3148 __ ret(0);
3149
3150 #ifdef ASSERT
3151 __ bind(bail);
3152 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3153 #endif
3154
3155 // Make sure all code is generated
3156 masm->flush();
3157
3158 // Fill-out other meta info
3159 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3160 }
3161
3162 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any GC of the args.
3169 //
3170 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3171 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3172
3173 // allocate space for the code
3174 ResourceMark rm;
3175
3176 CodeBuffer buffer(name, 1000, 512);
3177 MacroAssembler* masm = new MacroAssembler(&buffer);
3178
3179 int frame_size_in_words;
3180
3181 OopMapSet *oop_maps = new OopMapSet();
3182 OopMap* map = NULL;
3183
3184 int start = __ offset();
3185
3186 // No need to save vector registers since they are caller-saved anyway.
3187 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3188
3189 int frame_complete = __ offset();
3190
3191 __ set_last_Java_frame(noreg, noreg, NULL);
3192
3193 __ mov(c_rarg0, r15_thread);
3194
3195 __ call(RuntimeAddress(destination));
3196
3197
3198 // Set an oopmap for the call site.
3199 // We need this not only for callee-saved registers, but also for volatile
3200 // registers that the compiler might be keeping live across a safepoint.
3201
3202 oop_maps->add_gc_map( __ offset() - start, map);
3203
3204 // rax contains the address we are going to jump to assuming no exception got installed
3205
3206 // clear last_Java_sp
3207 __ reset_last_Java_frame(false);
3208 // check for pending exceptions
3209 Label pending;
3210 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3211 __ jcc(Assembler::notEqual, pending);
3212
3213 // get the returned Method*
3214 __ get_vm_result_2(rbx, r15_thread);
3215 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3216
3217 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
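  // The rbx/rax save slots now hold the resolved Method* and the new call
  // destination, so restore_live_registers below reloads them into rbx/rax.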
3218
3219 RegisterSaver::restore_live_registers(masm);
3220
  // We are back to the original state on entry and ready to go.
3222
3223 __ jmp(rax);
3224
3225 // Pending exception after the safepoint
3226
3227 __ bind(pending);
3228
3229 RegisterSaver::restore_live_registers(masm);
3230
3231 // exception pending => remove activation and forward to exception handler
3232
3233 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3234
3235 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3236 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3237
3238 // -------------
3239 // make sure all code is generated
3240 masm->flush();
3241
3242 // return the blob
  // frame size is in words here
3244 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3245 }
3246
3247 #ifdef COMPILER2
3248 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3249
3250 class NativeInvokerGenerator : public StubCodeGenerator {
3251 address _call_target;
3252 int _shadow_space_bytes;
3253
3254 const GrowableArray<VMReg>& _input_registers;
3255 const GrowableArray<VMReg>& _output_registers;
3256
3257 int _frame_complete;
3258 int _framesize;
3259 OopMapSet* _oop_maps;
3260 public:
3261 NativeInvokerGenerator(CodeBuffer* buffer,
3262 address call_target,
3263 int shadow_space_bytes,
3264 const GrowableArray<VMReg>& input_registers,
3265 const GrowableArray<VMReg>& output_registers)
3266 : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3267 _call_target(call_target),
3268 _shadow_space_bytes(shadow_space_bytes),
3269 _input_registers(input_registers),
3270 _output_registers(output_registers),
3271 _frame_complete(0),
3272 _framesize(0),
3273 _oop_maps(NULL) {
3274 assert(_output_registers.length() <= 1
3275 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3276
3277 }
3278
3279 void generate();
3280
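  // Number of bytes needed to spill the (single) return-value register around
  // the slow-path runtime calls: 8 for a general-purpose register, or the
  // active vector width (16/32/64 bytes depending on UseAVX) for an XMM return.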
3281 int spill_size_in_bytes() const {
3282 if (_output_registers.length() == 0) {
3283 return 0;
3284 }
3285 VMReg reg = _output_registers.at(0);
3286 assert(reg->is_reg(), "must be a register");
3287 if (reg->is_Register()) {
3288 return 8;
3289 } else if (reg->is_XMMRegister()) {
3290 if (UseAVX >= 3) {
3291 return 64;
3292 } else if (UseAVX >= 1) {
3293 return 32;
3294 } else {
3295 return 16;
3296 }
3297 } else {
3298 ShouldNotReachHere();
3299 }
3300 return 0;
3301 }
3302
3303 void spill_out_registers() {
3304 if (_output_registers.length() == 0) {
3305 return;
3306 }
3307 VMReg reg = _output_registers.at(0);
3308 assert(reg->is_reg(), "must be a register");
3309 MacroAssembler* masm = _masm;
3310 if (reg->is_Register()) {
3311 __ movptr(Address(rsp, 0), reg->as_Register());
3312 } else if (reg->is_XMMRegister()) {
3313 if (UseAVX >= 3) {
3314 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3315 } else if (UseAVX >= 1) {
3316 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3317 } else {
3318 __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3319 }
3320 } else {
3321 ShouldNotReachHere();
3322 }
3323 }
3324
3325 void fill_out_registers() {
3326 if (_output_registers.length() == 0) {
3327 return;
3328 }
3329 VMReg reg = _output_registers.at(0);
3330 assert(reg->is_reg(), "must be a register");
3331 MacroAssembler* masm = _masm;
3332 if (reg->is_Register()) {
3333 __ movptr(reg->as_Register(), Address(rsp, 0));
3334 } else if (reg->is_XMMRegister()) {
3335 if (UseAVX >= 3) {
3336 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3337 } else if (UseAVX >= 1) {
3338 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3339 } else {
3340 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3341 }
3342 } else {
3343 ShouldNotReachHere();
3344 }
3345 }
3346
3347 int frame_complete() const {
3348 return _frame_complete;
3349 }
3350
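  // _framesize is tracked in 32-bit stack slots; convert to words for the
  // RuntimeStub bookkeeping.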
3351 int framesize() const {
3352 return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3353 }
3354
3355 OopMapSet* oop_maps() const {
3356 return _oop_maps;
3357 }
3358
3359 private:
3360 #ifdef ASSERT
3361 bool target_uses_register(VMReg reg) {
3362 return _input_registers.contains(reg) || _output_registers.contains(reg);
3363 }
3364 #endif
3365 };
3366
3367 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3368 int shadow_space_bytes,
3369 const GrowableArray<VMReg>& input_registers,
3370 const GrowableArray<VMReg>& output_registers) {
3371 int locs_size = 64;
3372 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3373 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3374 g.generate();
3375 code.log_section_sizes("nep_invoker_blob");
3376
3377 RuntimeStub* stub =
3378 RuntimeStub::new_runtime_stub("nep_invoker_blob",
3379 &code,
3380 g.frame_complete(),
3381 g.framesize(),
3382 g.oop_maps(), false);
3383 return stub;
3384 }
3385
3386 void NativeInvokerGenerator::generate() {
3387 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3388
3389 enum layout {
3390 rbp_off,
3391 rbp_off2,
3392 return_off,
3393 return_off2,
3394 framesize // inclusive of return address
3395 };
3396
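  // Frame size is computed in 32-bit slots: the fixed rbp/return-address slots
  // plus shadow space and the return-value spill area, rounded up to a multiple
  // of 4 slots (16 bytes) so that rsp stays 16-byte aligned.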
3397 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3398 assert(is_even(_framesize/2), "sp not 16-byte aligned");
3399
3400 _oop_maps = new OopMapSet();
3401 MacroAssembler* masm = _masm;
3402
3403 address start = __ pc();
3404
3405 __ enter();
3406
3407 // return address and rbp are already in place
3408 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3409
3410 _frame_complete = __ pc() - start;
3411
3412 address the_pc = __ pc();
3413
3414 __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3415 OopMap* map = new OopMap(_framesize, 0);
3416 _oop_maps->add_gc_map(the_pc - start, map);
3417
3418 // State transition
3419 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3420
3421 __ call(RuntimeAddress(_call_target));
3422
3423 __ restore_cpu_control_state_after_jni();
3424
3425 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3426
3427 // Force this write out before the read below
3428 __ membar(Assembler::Membar_mask_bits(
3429 Assembler::LoadLoad | Assembler::LoadStore |
3430 Assembler::StoreLoad | Assembler::StoreStore));
3431
3432 Label L_after_safepoint_poll;
3433 Label L_safepoint_poll_slow_path;
3434
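  // Take the slow path if a safepoint/handshake poll is pending or the thread
  // has a suspend request.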
3435 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3436 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3437 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3438
3439 __ bind(L_after_safepoint_poll);
3440
3441 // change thread state
3442 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3443
3444 __ block_comment("reguard stack check");
3445 Label L_reguard;
3446 Label L_after_reguard;
3447 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3448 __ jcc(Assembler::equal, L_reguard);
3449 __ bind(L_after_reguard);
3450
3451 __ reset_last_Java_frame(r15_thread, true);
3452
3453 __ leave(); // required for proper stackwalking of RuntimeStub frame
3454 __ ret(0);
3455
3456 //////////////////////////////////////////////////////////////////////////////
3457
3458 __ block_comment("{ L_safepoint_poll_slow_path");
3459 __ bind(L_safepoint_poll_slow_path);
3460 __ vzeroupper();
3461
3462 spill_out_registers();
3463
3464 __ mov(c_rarg0, r15_thread);
3465 __ mov(r12, rsp); // remember sp
3466 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3467 __ andptr(rsp, -16); // align stack as required by ABI
3468 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3469 __ mov(rsp, r12); // restore sp
3470 __ reinit_heapbase();
3471
3472 fill_out_registers();
3473
3474 __ jmp(L_after_safepoint_poll);
3475 __ block_comment("} L_safepoint_poll_slow_path");
3476
3477 //////////////////////////////////////////////////////////////////////////////
3478
3479 __ block_comment("{ L_reguard");
3480 __ bind(L_reguard);
3481 __ vzeroupper();
3482
3483 spill_out_registers();
3484
3485 __ mov(r12, rsp); // remember sp
3486 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3487 __ andptr(rsp, -16); // align stack as required by ABI
3488 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3489 __ mov(rsp, r12); // restore sp
3490 __ reinit_heapbase();
3491
3492 fill_out_registers();
3493
3494 __ jmp(L_after_reguard);
3495
3496 __ block_comment("} L_reguard");
3497
3498 //////////////////////////////////////////////////////////////////////////////
3499
3500 __ flush();
3501 }
3502 #endif // COMPILER2
3503
3504 //------------------------------Montgomery multiplication------------------------
3505 //
3506
3507 #ifndef _WINDOWS
3508
3509 // Subtract 0:b from carry:a. Return carry.
3510 static julong
3511 sub(julong a[], julong b[], julong carry, long len) {
3512 long long i = 0, cnt = len;
3513 julong tmp;
3514 asm volatile("clc; "
3515 "0: ; "
3516 "mov (%[b], %[i], 8), %[tmp]; "
3517 "sbb %[tmp], (%[a], %[i], 8); "
3518 "inc %[i]; dec %[cnt]; "
3519 "jne 0b; "
3520 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3521 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3522 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3523 : "memory");
3524 return tmp;
3525 }
3526
3527 // Multiply (unsigned) Long A by Long B, accumulating the double-
3528 // length result into the accumulator formed of T0, T1, and T2.
3529 #define MACC(A, B, T0, T1, T2) \
3530 do { \
3531 unsigned long hi, lo; \
3532 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3533 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3534 : "r"(A), "a"(B) : "cc"); \
3535 } while(0)
3536
3537 // As above, but add twice the double-length result into the
3538 // accumulator.
3539 #define MACC2(A, B, T0, T1, T2) \
3540 do { \
3541 unsigned long hi, lo; \
3542 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3543 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3544 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3545 : "r"(A), "a"(B) : "cc"); \
3546 } while(0)
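
// In other words: MACC adds the 128-bit product A*B into the 192-bit
// accumulator T2:T1:T0, and MACC2 adds that product twice (used for the
// squaring case, where each cross term a[i]*a[j] appears twice).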
3547
3548 #else //_WINDOWS
3549
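// Subtract 0:b from carry:a. Return carry. (Intrinsics version: a - b is
// computed as a + ~b + 1 by seeding _addcarry_u64 with an initial carry of 1.)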
3550 static julong
3551 sub(julong a[], julong b[], julong carry, long len) {
3552 long i;
3553 julong tmp;
3554 unsigned char c = 1;
3555 for (i = 0; i < len; i++) {
3556 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3557 a[i] = tmp;
3558 }
3559 c = _addcarry_u64(c, carry, ~0, &tmp);
3560 return tmp;
3561 }
3562
3563 // Multiply (unsigned) Long A by Long B, accumulating the double-
3564 // length result into the accumulator formed of T0, T1, and T2.
3565 #define MACC(A, B, T0, T1, T2) \
3566 do { \
3567 julong hi, lo; \
3568 lo = _umul128(A, B, &hi); \
3569 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3570 c = _addcarry_u64(c, hi, T1, &T1); \
3571 _addcarry_u64(c, T2, 0, &T2); \
3572 } while(0)
3573
3574 // As above, but add twice the double-length result into the
3575 // accumulator.
3576 #define MACC2(A, B, T0, T1, T2) \
3577 do { \
3578 julong hi, lo; \
3579 lo = _umul128(A, B, &hi); \
3580 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3581 c = _addcarry_u64(c, hi, T1, &T1); \
3582 _addcarry_u64(c, T2, 0, &T2); \
3583 c = _addcarry_u64(0, lo, T0, &T0); \
3584 c = _addcarry_u64(c, hi, T1, &T1); \
3585 _addcarry_u64(c, T2, 0, &T2); \
3586 } while(0)
3587
3588 #endif //_WINDOWS
3589
3590 // Fast Montgomery multiplication. The derivation of the algorithm is
3591 // in A Cryptographic Library for the Motorola DSP56000,
3592 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3593
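// A sketch of the invariant maintained below: with R = 2^(64*len) and
// inv == -n[0]^-1 mod 2^64 (checked by the assert), the routine accumulates
// a*b + m*n one 64-bit column at a time, choosing each m[i] so that the low
// word of the column becomes zero; the surviving high words form the
// Montgomery product a*b*R^-1 (mod n).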
3594 static void NOINLINE
3595 montgomery_multiply(julong a[], julong b[], julong n[],
3596 julong m[], julong inv, int len) {
3597 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3598 int i;
3599
3600 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3601
3602 for (i = 0; i < len; i++) {
3603 int j;
3604 for (j = 0; j < i; j++) {
3605 MACC(a[j], b[i-j], t0, t1, t2);
3606 MACC(m[j], n[i-j], t0, t1, t2);
3607 }
3608 MACC(a[i], b[0], t0, t1, t2);
3609 m[i] = t0 * inv;
3610 MACC(m[i], n[0], t0, t1, t2);
3611
3612 assert(t0 == 0, "broken Montgomery multiply");
3613
3614 t0 = t1; t1 = t2; t2 = 0;
3615 }
3616
3617 for (i = len; i < 2*len; i++) {
3618 int j;
3619 for (j = i-len+1; j < len; j++) {
3620 MACC(a[j], b[i-j], t0, t1, t2);
3621 MACC(m[j], n[i-j], t0, t1, t2);
3622 }
3623 m[i-len] = t0;
3624 t0 = t1; t1 = t2; t2 = 0;
3625 }
3626
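  // If the accumulation carried out of the top word (t0 != 0), subtract n
  // until the carry clears.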
3627 while (t0)
3628 t0 = sub(m, n, t0, len);
3629 }
3630
3631 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3632 // multiplies so it should be up to 25% faster than Montgomery
3633 // multiplication. However, its loop control is more complex and it
3634 // may actually run slower on some machines.
3635
3636 static void NOINLINE
3637 montgomery_square(julong a[], julong n[],
3638 julong m[], julong inv, int len) {
3639 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3640 int i;
3641
3642 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3643
3644 for (i = 0; i < len; i++) {
3645 int j;
3646 int end = (i+1)/2;
3647 for (j = 0; j < end; j++) {
3648 MACC2(a[j], a[i-j], t0, t1, t2);
3649 MACC(m[j], n[i-j], t0, t1, t2);
3650 }
3651 if ((i & 1) == 0) {
3652 MACC(a[j], a[j], t0, t1, t2);
3653 }
3654 for (; j < i; j++) {
3655 MACC(m[j], n[i-j], t0, t1, t2);
3656 }
3657 m[i] = t0 * inv;
3658 MACC(m[i], n[0], t0, t1, t2);
3659
3660 assert(t0 == 0, "broken Montgomery square");
3661
3662 t0 = t1; t1 = t2; t2 = 0;
3663 }
3664
3665 for (i = len; i < 2*len; i++) {
3666 int start = i-len+1;
3667 int end = start + (len - start)/2;
3668 int j;
3669 for (j = start; j < end; j++) {
3670 MACC2(a[j], a[i-j], t0, t1, t2);
3671 MACC(m[j], n[i-j], t0, t1, t2);
3672 }
3673 if ((i & 1) == 0) {
3674 MACC(a[j], a[j], t0, t1, t2);
3675 }
3676 for (; j < len; j++) {
3677 MACC(m[j], n[i-j], t0, t1, t2);
3678 }
3679 m[i-len] = t0;
3680 t0 = t1; t1 = t2; t2 = 0;
3681 }
3682
3683 while (t0)
3684 t0 = sub(m, n, t0, len);
3685 }
3686
3687 // Swap words in a longword.
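// e.g. swap(0x0000000100000002ULL) == 0x0000000200000001ULL.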
3688 static julong swap(julong x) {
3689 return (x << 32) | (x >> 32);
3690 }
3691
3692 // Copy len longwords from s to d, word-swapping as we go. The
3693 // destination array is reversed.
3694 static void reverse_words(julong *s, julong *d, int len) {
3695 d += len;
3696 while(len-- > 0) {
3697 d--;
3698 *d = swap(*s);
3699 s++;
3700 }
3701 }
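
// Together, the per-word swap and the array reversal convert the caller's
// most-significant-word-first jint arrays into the least-significant-first
// julong layout that the Montgomery routines above operate on (and convert
// the result back on the way out).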
3702
3703 // The threshold at which squaring is advantageous was determined
3704 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3705 #define MONTGOMERY_SQUARING_THRESHOLD 64
3706
3707 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3708 jint len, jlong inv,
3709 jint *m_ints) {
3710 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3711 int longwords = len/2;
3712
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
3716 int divisor = sizeof(julong) * 4;
3717 guarantee(longwords <= 8192 / divisor, "must be");
3718 int total_allocation = longwords * sizeof (julong) * 4;
3719 julong *scratch = (julong *)alloca(total_allocation);
3720
3721 // Local scratch arrays
3722 julong
3723 *a = scratch + 0 * longwords,
3724 *b = scratch + 1 * longwords,
3725 *n = scratch + 2 * longwords,
3726 *m = scratch + 3 * longwords;
3727
3728 reverse_words((julong *)a_ints, a, longwords);
3729 reverse_words((julong *)b_ints, b, longwords);
3730 reverse_words((julong *)n_ints, n, longwords);
3731
3732 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3733
3734 reverse_words(m, (julong *)m_ints, longwords);
3735 }
3736
3737 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3738 jint len, jlong inv,
3739 jint *m_ints) {
3740 assert(len % 2 == 0, "array length in montgomery_square must be even");
3741 int longwords = len/2;
3742
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
3746 int divisor = sizeof(julong) * 3;
3747 guarantee(longwords <= (8192 / divisor), "must be");
3748 int total_allocation = longwords * sizeof (julong) * 3;
3749 julong *scratch = (julong *)alloca(total_allocation);
3750
3751 // Local scratch arrays
3752 julong
3753 *a = scratch + 0 * longwords,
3754 *n = scratch + 1 * longwords,
3755 *m = scratch + 2 * longwords;
3756
3757 reverse_words((julong *)a_ints, a, longwords);
3758 reverse_words((julong *)n_ints, n, longwords);
3759
3760 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3761 ::montgomery_square(a, n, m, (julong)inv, longwords);
3762 } else {
3763 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3764 }
3765
3766 reverse_words(m, (julong *)m_ints, longwords);
3767 }
3768
3769 #ifdef COMPILER2
3770 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3771 //
3772 //------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// This code is jumped to from a compiled method when an exception is thrown
// (see emit_exception_handler in the x86_64.ad file).
3776 //
// Given an exception pc at a call, we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e., callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
3782 //
3783 // This code is entered with a jmp.
3784 //
3785 // Arguments:
3786 // rax: exception oop
3787 // rdx: exception pc
3788 //
3789 // Results:
3790 // rax: exception oop
3791 // rdx: exception pc in caller or ???
3792 // destination: exception handler of caller
3793 //
3794 // Note: the exception pc MUST be at a call (precise debug information)
3795 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3796 //
3797
3798 void OptoRuntime::generate_exception_blob() {
3799 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3800 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3801 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3802
3803 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3804
3805 // Allocate space for the code
3806 ResourceMark rm;
3807 // Setup code generation tools
3808 CodeBuffer buffer("exception_blob", 2048, 1024);
3809 MacroAssembler* masm = new MacroAssembler(&buffer);
3810
3811
3812 address start = __ pc();
3813
3814 // Exception pc is 'return address' for stack walker
3815 __ push(rdx);
3816 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3817
3818 // Save callee-saved registers. See x86_64.ad.
3819
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3823
3824 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3825
  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
3829 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3830 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3831 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3832
3833 // This call does all the hard work. It checks if an exception handler
3834 // exists in the method.
3835 // If so, it returns the handler address.
3836 // If not, it prepares for stack-unwinding, restoring the callee-save
3837 // registers of the frame being removed.
3838 //
3839 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3840
3841 // At a method handle call, the stack may not be properly aligned
3842 // when returning with an exception.
3843 address the_pc = __ pc();
3844 __ set_last_Java_frame(noreg, noreg, the_pc);
3845 __ mov(c_rarg0, r15_thread);
3846 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3847 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3848
3849 // Set an oopmap for the call site. This oopmap will only be used if we
3850 // are unwinding the stack. Hence, all locations will be dead.
3851 // Callee-saved registers will be the same as the frame above (i.e.,
3852 // handle_exception_stub), since they were restored when we got the
3853 // exception.
3854
3855 OopMapSet* oop_maps = new OopMapSet();
3856
3857 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3858
3859 __ reset_last_Java_frame(false);
3860
3861 // Restore callee-saved registers
3862
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3866
3867 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3868
3869 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3870 __ pop(rdx); // No need for exception pc anymore
3871
3872 // rax: exception handler
3873
3874 // We have a handler in rax (could be deopt blob).
3875 __ mov(r8, rax);
3876
3877 // Get the exception oop
3878 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3879 // Get the exception pc in case we are deoptimized
3880 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3881 #ifdef ASSERT
3882 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3883 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3884 #endif
3885 // Clear the exception oop so GC no longer processes it as a root.
3886 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3887
3888 // rax: exception oop
3889 // r8: exception handler
3890 // rdx: exception pc
3891 // Jump to handler
3892
3893 __ jmp(r8);
3894
3895 // Make sure all code is generated
3896 masm->flush();
3897
3898 // Set exception blob
3899 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3900 }
3901 #endif // COMPILER2
3902
3903 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3904 int total_in_args, const VMRegPair* in_regs,
3905 int total_out_args, VMRegPair* out_regs,
3906 GrowableArray<int>& arg_order,
3907 VMRegPair tmp_vmreg) {
3908 ComputeMoveOrder order(total_in_args, in_regs,
3909 total_out_args, out_regs,
3910 in_sig_bt, arg_order, tmp_vmreg);
3911 }