1 /*
2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #ifndef _WINDOWS
27 #include "alloca.h"
28 #endif
29 #include "asm/macroAssembler.hpp"
30 #include "asm/macroAssembler.inline.hpp"
31 #include "code/debugInfoRec.hpp"
32 #include "code/icBuffer.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/compiledICHolder.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/jniHandles.hpp"
48 #include "runtime/safepointMechanism.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/signature.hpp"
51 #include "runtime/stubRoutines.hpp"
52 #include "runtime/vframeArray.hpp"
53 #include "runtime/vm_version.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/formatBuffer.hpp"
56 #include "vmreg_x86.inline.hpp"
57 #ifdef COMPILER1
58 #include "c1/c1_Runtime1.hpp"
59 #endif
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_JVMCI
64 #include "jvmci/jvmciJavaClasses.hpp"
65 #endif
66
67 #define __ masm->
68
69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
70
71 class SimpleRuntimeFrame {
72
73 public:
74
75 // Most of the runtime stubs have this simple frame layout.
76 // This class exists to make the layout shared in one place.
77 // Offsets are for compiler stack slots, which are jints.
78 enum layout {
79 // The frame sender code expects that rbp will be in the "natural" place and
80 // will override any oopMap setting for it. We must therefore force the layout
81 // so that it agrees with the frame sender code.
82 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
83 rbp_off2,
84 return_off, return_off2,
85 framesize
86 };
87 };
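// For illustration: the layout above corresponds, from low to high addresses, to
// [optional arg register save area][saved rbp (2 slots)][return address (2 slots)],
// with 'framesize' giving the total size in 4-byte compiler slots.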
88
89 class RegisterSaver {
90 // Capture info about frame layout. Layout offsets are in jint
91 // units because compiler frame slots are jints.
92 #define XSAVE_AREA_BEGIN 160
93 #define XSAVE_AREA_YMM_BEGIN 576
94 #define XSAVE_AREA_OPMASK_BEGIN 1088
95 #define XSAVE_AREA_ZMM_BEGIN 1152
96 #define XSAVE_AREA_UPPERBANK 1664
97 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
98 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
99 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
100 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
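// The values above are byte offsets into the save area laid down by push_CPU_state():
// XMM0 starts at offset 160 of the legacy fxsave image, and the later XSAVE_AREA_*
// offsets are where the code below stores the upper YMM/ZMM halves and the opmask
// registers (assumption: they mirror the standard XSAVE extended-area layout).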
102 enum layout {
103 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
104 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
105 DEF_XMM_OFFS(0),
106 DEF_XMM_OFFS(1),
107 // 2..15 are implied in range usage
108 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
109 DEF_YMM_OFFS(0),
110 DEF_YMM_OFFS(1),
111 // 2..15 are implied in range usage
112 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
113 DEF_OPMASK_OFFS(0),
114 DEF_OPMASK_OFFS(1),
115 // 2..7 are implied in range usage
116 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
117 DEF_ZMM_OFFS(0),
118 DEF_ZMM_OFFS(1),
119 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
120 DEF_ZMM_UPPER_OFFS(16),
121 DEF_ZMM_UPPER_OFFS(17),
122 // 18..31 are implied in range usage
123 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
124 fpu_stateH_end,
125 r15_off, r15H_off,
126 r14_off, r14H_off,
127 r13_off, r13H_off,
128 r12_off, r12H_off,
129 r11_off, r11H_off,
130 r10_off, r10H_off,
131 r9_off, r9H_off,
132 r8_off, r8H_off,
133 rdi_off, rdiH_off,
134 rsi_off, rsiH_off,
135 ignore_off, ignoreH_off, // extra copy of rbp
136 rsp_off, rspH_off,
137 rbx_off, rbxH_off,
138 rdx_off, rdxH_off,
139 rcx_off, rcxH_off,
140 rax_off, raxH_off,
141 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
142 align_off, alignH_off,
143 flags_off, flagsH_off,
144 // The frame sender code expects that rbp will be in the "natural" place and
145 // will override any oopMap setting for it. We must therefore force the layout
146 // so that it agrees with the frame sender code.
147 rbp_off, rbpH_off, // copy of rbp we will restore
148 return_off, returnH_off, // slot for return address
149 reg_save_size // size in compiler stack slots
150 };
151
152 public:
153 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
154 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
155
156 // Offsets into the register save area
157 // Used by deoptimization when it is managing result register
158 // values on its own
159
160 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
161 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
162 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
163 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
164 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
165
166 // During deoptimization only the result registers need to be restored,
167 // all the other values have already been extracted.
168 static void restore_result_registers(MacroAssembler* masm);
169 };
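// Typical usage sketch (illustration only; the safepoint/exception blobs later in this
// file follow roughly this pattern):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0 /* additional_frame_words */,
//                                                    &frame_size_in_words, save_wide_vectors);
//   // ... emit the call into the VM ...
//   RegisterSaver::restore_live_registers(masm, save_wide_vectors);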
170
171 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
172 int off = 0;
173 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
174 if (UseAVX < 3) {
175 num_xmm_regs = num_xmm_regs/2;
176 }
177 #if COMPILER2_OR_JVMCI
178 if (save_wide_vectors && UseAVX == 0) {
179 save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
180 }
181 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
182 #else
183 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
184 #endif
185
186 // Always make the frame size 16-byte aligned; both vector and non-vector stacks are allocated this way
187 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
188 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
189 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
190 // CodeBlob frame size is in words.
191 int frame_size_in_words = frame_size_in_bytes / wordSize;
192 *total_frame_words = frame_size_in_words;
193
194 // Save registers, fpu state, and flags.
195 // We assume caller has already pushed the return address onto the
196 // stack, so rsp is 8-byte aligned here.
197 // We push rbp twice in this sequence because we want the real rbp
198 // to be under the return address like a normal enter.
199
200 __ enter(); // rsp becomes 16-byte aligned here
201 __ push_CPU_state(); // Push a multiple of 16 bytes
202
203 // push_CPU_state handles this on EVEX enabled targets
204 if (save_wide_vectors) {
205 // Save upper half of YMM registers(0..15)
206 int base_addr = XSAVE_AREA_YMM_BEGIN;
207 for (int n = 0; n < 16; n++) {
208 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
209 }
210 if (VM_Version::supports_evex()) {
211 // Save upper half of ZMM registers(0..15)
212 base_addr = XSAVE_AREA_ZMM_BEGIN;
213 for (int n = 0; n < 16; n++) {
214 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
215 }
216 // Save full ZMM registers(16..num_xmm_regs)
217 base_addr = XSAVE_AREA_UPPERBANK;
218 off = 0;
219 int vector_len = Assembler::AVX_512bit;
220 for (int n = 16; n < num_xmm_regs; n++) {
221 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
222 }
223 #if COMPILER2_OR_JVMCI
224 base_addr = XSAVE_AREA_OPMASK_BEGIN;
225 off = 0;
226 for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
227 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
228 }
229 #endif
230 }
231 } else {
232 if (VM_Version::supports_evex()) {
233 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
234 int base_addr = XSAVE_AREA_UPPERBANK;
235 off = 0;
236 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
237 for (int n = 16; n < num_xmm_regs; n++) {
238 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
239 }
240 #if COMPILER2_OR_JVMCI
241 base_addr = XSAVE_AREA_OPMASK_BEGIN;
242 off = 0;
243 for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
244 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
245 }
246 #endif
247 }
248 }
249 __ vzeroupper();
250 if (frame::arg_reg_save_area_bytes != 0) {
251 // Allocate argument register save area
252 __ subptr(rsp, frame::arg_reg_save_area_bytes);
253 }
254
255 // Set an oopmap for the call site. This oopmap will map all
256 // oop-registers and debug-info registers as callee-saved. This
257 // will allow deoptimization at this safepoint to find all possible
258 // debug-info recordings, as well as let GC find all oops.
259
260 OopMapSet *oop_maps = new OopMapSet();
261 OopMap* map = new OopMap(frame_size_in_slots, 0);
262
263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
264
265 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
266 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
267 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
268 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
269 // rbp location is known implicitly by the frame sender code, needs no oopmap
270 // and the location where rbp was saved is ignored
271 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
272 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
273 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
274 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
275 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
276 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
277 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
278 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
279 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
280 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
281 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
282 // on EVEX enabled targets, we get it included in the xsave area
283 off = xmm0_off;
284 int delta = xmm1_off - off;
285 for (int n = 0; n < 16; n++) {
286 XMMRegister xmm_name = as_XMMRegister(n);
287 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
288 off += delta;
289 }
290 if (UseAVX > 2) {
291 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
292 off = zmm16_off;
293 delta = zmm17_off - off;
294 for (int n = 16; n < num_xmm_regs; n++) {
295 XMMRegister zmm_name = as_XMMRegister(n);
296 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
297 off += delta;
298 }
299 }
300
301 #if COMPILER2_OR_JVMCI
302 if (save_wide_vectors) {
303 // Save upper half of YMM registers(0..15)
304 off = ymm0_off;
305 delta = ymm1_off - ymm0_off;
306 for (int n = 0; n < 16; n++) {
307 XMMRegister ymm_name = as_XMMRegister(n);
308 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
309 off += delta;
310 }
311 if (VM_Version::supports_evex()) {
312 // Save upper half of ZMM registers(0..15)
313 off = zmm0_off;
314 delta = zmm1_off - zmm0_off;
315 for (int n = 0; n < 16; n++) {
316 XMMRegister zmm_name = as_XMMRegister(n);
317 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
318 off += delta;
319 }
320 }
321 }
322 #endif // COMPILER2_OR_JVMCI
323
324 // %%% These should all be a waste but we'll keep things as they were for now
325 if (true) {
326 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
327 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
328 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
329 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
330 // rbp location is known implicitly by the frame sender code, needs no oopmap
331 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
332 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
333 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
334 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
335 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
336 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
337 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
338 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
339 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
340 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
341 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
342 // on EVEX enabled targets, we get it included in the xsave area
343 off = xmm0H_off;
344 delta = xmm1H_off - off;
345 for (int n = 0; n < 16; n++) {
346 XMMRegister xmm_name = as_XMMRegister(n);
347 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
348 off += delta;
349 }
350 if (UseAVX > 2) {
351 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
352 off = zmm16H_off;
353 delta = zmm17H_off - off;
354 for (int n = 16; n < num_xmm_regs; n++) {
355 XMMRegister zmm_name = as_XMMRegister(n);
356 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
357 off += delta;
358 }
359 }
360 }
361
362 return map;
363 }
364
365 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
366 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
367 if (UseAVX < 3) {
368 num_xmm_regs = num_xmm_regs/2;
369 }
370 if (frame::arg_reg_save_area_bytes != 0) {
371 // Pop arg register save area
372 __ addptr(rsp, frame::arg_reg_save_area_bytes);
373 }
374
375 #if COMPILER2_OR_JVMCI
376 if (restore_wide_vectors) {
377 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
378 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
379 }
380 #else
381 assert(!restore_wide_vectors, "vectors are generated only by C2");
382 #endif
383
384 __ vzeroupper();
385
386 // On EVEX enabled targets everything is handled in pop fpu state
387 if (restore_wide_vectors) {
388 // Restore upper half of YMM registers (0..15)
389 int base_addr = XSAVE_AREA_YMM_BEGIN;
390 for (int n = 0; n < 16; n++) {
391 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
392 }
393 if (VM_Version::supports_evex()) {
394 // Restore upper half of ZMM registers (0..15)
395 base_addr = XSAVE_AREA_ZMM_BEGIN;
396 for (int n = 0; n < 16; n++) {
397 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
398 }
399 // Restore full ZMM registers(16..num_xmm_regs)
400 base_addr = XSAVE_AREA_UPPERBANK;
401 int vector_len = Assembler::AVX_512bit;
402 int off = 0;
403 for (int n = 16; n < num_xmm_regs; n++) {
404 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
405 }
406 #if COMPILER2_OR_JVMCI
407 base_addr = XSAVE_AREA_OPMASK_BEGIN;
408 off = 0;
409 for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
410 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
411 }
412 #endif
413 }
414 } else {
415 if (VM_Version::supports_evex()) {
416 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
417 int base_addr = XSAVE_AREA_UPPERBANK;
418 int off = 0;
419 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
420 for (int n = 16; n < num_xmm_regs; n++) {
421 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
422 }
423 #if COMPILER2_OR_JVMCI
424 base_addr = XSAVE_AREA_OPMASK_BEGIN;
425 off = 0;
426 for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
427 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
428 }
429 #endif
430 }
431 }
432
433 // Recover CPU state
434 __ pop_CPU_state();
435 // Get the rbp described implicitly by the calling convention (no oopMap)
436 __ pop(rbp);
437 }
438
439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
440
441 // Just restore result register. Only used by deoptimization. By
442 // now any callee save register that needs to be restored to a c2
443 // caller of the deoptee has been extracted into the vframeArray
444 // and will be stuffed into the c2i adapter we create for later
445 // restoration so only result registers need to be restored here.
446
447 // Restore fp result register
448 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
449 // Restore integer result register
450 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
451 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
452
453 // Pop all of the register save area off the stack except the return address
454 __ addptr(rsp, return_offset_in_bytes());
455 }
456
457 // Is the vector's size (in bytes) bigger than the size saved by default?
458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
459 bool SharedRuntime::is_wide_vector(int size) {
460 return size > 16;
461 }
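// For example, a 32-byte YMM or 64-byte ZMM vector is "wide" and needs the extra
// save/restore logic above, while a 16-byte XMM vector is already covered by fxsave.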
462
463 // ---------------------------------------------------------------------------
464 // Read the array of BasicTypes from a signature, and compute where the
465 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
466 // quantities. Values less than VMRegImpl::stack0 are registers, those above
467 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
468 // as framesizes are fixed.
469 // VMRegImpl::stack0 refers to the first slot 0(sp),
470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
471 // values up to RegisterImpl::number_of_registers are the 64-bit
472 // integer registers.
473
474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
475 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
476 // units regardless of build. Of course for i486 there is no 64 bit build
477
478 // The Java calling convention is a "shifted" version of the C ABI.
479 // By skipping the first C ABI register we can call non-static jni methods
480 // with small numbers of arguments without having to shuffle the arguments
481 // at all. Since we control the java ABI we ought to at least get some
482 // advantage out of it.
483
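// Illustrative example (not used by the code): for a signature whose expanded BasicType
// array is { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID, T_FLOAT } (e.g. a static
// method taking int, long, Object, double, float), the loop below assigns
//   T_INT    -> j_rarg0      T_LONG   -> j_rarg1 (both halves)
//   T_OBJECT -> j_rarg2      T_DOUBLE -> j_farg0 (both halves)
//   T_FLOAT  -> j_farg1      T_VOID   -> BAD (the unused high halves)
// and, since no stack slots are needed, returns 0.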
484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
485 VMRegPair *regs,
486 int total_args_passed) {
487
488 // Create the mapping between argument positions and
489 // registers.
490 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
491 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
492 };
493 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
494 j_farg0, j_farg1, j_farg2, j_farg3,
495 j_farg4, j_farg5, j_farg6, j_farg7
496 };
497
498
499 uint int_args = 0;
500 uint fp_args = 0;
501 uint stk_args = 0; // inc by 2 each time
502
503 for (int i = 0; i < total_args_passed; i++) {
504 switch (sig_bt[i]) {
505 case T_BOOLEAN:
506 case T_CHAR:
507 case T_BYTE:
508 case T_SHORT:
509 case T_INT:
510 if (int_args < Argument::n_int_register_parameters_j) {
511 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
512 } else {
513 regs[i].set1(VMRegImpl::stack2reg(stk_args));
514 stk_args += 2;
515 }
516 break;
517 case T_VOID:
518 // halves of T_LONG or T_DOUBLE
519 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
520 regs[i].set_bad();
521 break;
522 case T_LONG:
523 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
524 // fall through
525 case T_OBJECT:
526 case T_ARRAY:
527 case T_ADDRESS:
528 if (int_args < Argument::n_int_register_parameters_j) {
529 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
530 } else {
531 regs[i].set2(VMRegImpl::stack2reg(stk_args));
532 stk_args += 2;
533 }
534 break;
535 case T_FLOAT:
536 if (fp_args < Argument::n_float_register_parameters_j) {
537 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
538 } else {
539 regs[i].set1(VMRegImpl::stack2reg(stk_args));
540 stk_args += 2;
541 }
542 break;
543 case T_DOUBLE:
544 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
545 if (fp_args < Argument::n_float_register_parameters_j) {
546 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
547 } else {
548 regs[i].set2(VMRegImpl::stack2reg(stk_args));
549 stk_args += 2;
550 }
551 break;
552 default:
553 ShouldNotReachHere();
554 break;
555 }
556 }
557
558 return align_up(stk_args, 2);
559 }
560
561 // Patch the caller's callsite with the entry to compiled code if it exists.
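// On entry rbx holds the callee Method* and the return address is still at 0(rsp); all
// argument registers are live, which is why the full CPU state is saved around the call
// into the VM below.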
562 static void patch_callers_callsite(MacroAssembler *masm) {
563 Label L;
564 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
565 __ jcc(Assembler::equal, L);
566
567 // Save the current stack pointer
568 __ mov(r13, rsp);
569 // Schedule the branch target address early.
570 // Call into the VM to patch the caller, then jump to compiled callee
571 // rax isn't live so capture return address while we easily can
572 __ movptr(rax, Address(rsp, 0));
573
574 // align stack so push_CPU_state doesn't fault
575 __ andptr(rsp, -(StackAlignmentInBytes));
576 __ push_CPU_state();
577 __ vzeroupper();
578 // VM needs caller's callsite
579 // VM needs target method
580 // This needs to be a long call since we will relocate this adapter to
581 // the codeBuffer and it may not reach
582
583 // Allocate argument register save area
584 if (frame::arg_reg_save_area_bytes != 0) {
585 __ subptr(rsp, frame::arg_reg_save_area_bytes);
586 }
587 __ mov(c_rarg0, rbx);
588 __ mov(c_rarg1, rax);
589 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
590
591 // De-allocate argument register save area
592 if (frame::arg_reg_save_area_bytes != 0) {
593 __ addptr(rsp, frame::arg_reg_save_area_bytes);
594 }
595
596 __ vzeroupper();
597 __ pop_CPU_state();
598 // restore sp
599 __ mov(rsp, r13);
600 __ bind(L);
601 }
602
603
604 static void gen_c2i_adapter(MacroAssembler *masm,
605 int total_args_passed,
606 int comp_args_on_stack,
607 const BasicType *sig_bt,
608 const VMRegPair *regs,
609 Label& skip_fixup) {
610 // Before we get into the guts of the C2I adapter, see if we should be here
611 // at all. We've come from compiled code and are attempting to jump to the
612 // interpreter, which means the caller made a static call to get here
613 // (vcalls always get a compiled target if there is one). Check for a
614 // compiled target. If there is one, we need to patch the caller's call.
615 patch_callers_callsite(masm);
616
617 __ bind(skip_fixup);
618
619 // Since all args are passed on the stack, total_args_passed *
620 // Interpreter::stackElementSize is the space we need.  Plus one extra word
621 // for the return address location, since we store it first rather than
622 // holding it in rax across all the shuffling.
623
624 int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
625
626 // stack is aligned, keep it that way
627 extraspace = align_up(extraspace, 2*wordSize);
628
629 // Get return address
630 __ pop(rax);
631
632 // set senderSP value
633 __ mov(r13, rsp);
634
635 __ subptr(rsp, extraspace);
636
637 // Store the return address in the expected location
638 __ movptr(Address(rsp, 0), rax);
639
640 // Now write the args into the outgoing interpreter space
641 for (int i = 0; i < total_args_passed; i++) {
642 if (sig_bt[i] == T_VOID) {
643 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
644 continue;
645 }
646
647 // offset to start parameters
648 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
649 int next_off = st_off - Interpreter::stackElementSize;
650
651 // Say 4 args:
652 // i st_off
653 // 0 32 T_LONG
654 // 1 24 T_VOID
655 // 2 16 T_OBJECT
656 // 3 8 T_BOOL
657 // - 0 return address
658 //
659 // However, to make things extra confusing: because we can fit a long/double in
660 // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
661 // leaves one slot empty and only stores to a single slot. In this case the
662 // slot that is occupied is the T_VOID slot. See, I said it was confusing.
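// For the example above: the T_LONG at i == 0 is actually stored at next_off == 24 (the
// T_VOID slot), while st_off == 32 is left unused (and filled with junk in debug builds).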
663
664 VMReg r_1 = regs[i].first();
665 VMReg r_2 = regs[i].second();
666 if (!r_1->is_valid()) {
667 assert(!r_2->is_valid(), "");
668 continue;
669 }
670 if (r_1->is_stack()) {
671 // memory to memory use rax
672 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
673 if (!r_2->is_valid()) {
674 // sign extend??
675 __ movl(rax, Address(rsp, ld_off));
676 __ movptr(Address(rsp, st_off), rax);
677
678 } else {
679
680 __ movq(rax, Address(rsp, ld_off));
681
682 // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
683 // T_DOUBLE and T_LONG use two slots in the interpreter
684 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
685 // ld_off == LSW, ld_off+wordSize == MSW
686 // st_off == MSW, next_off == LSW
687 __ movq(Address(rsp, next_off), rax);
688 #ifdef ASSERT
689 // Overwrite the unused slot with known junk
690 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
691 __ movptr(Address(rsp, st_off), rax);
692 #endif /* ASSERT */
693 } else {
694 __ movq(Address(rsp, st_off), rax);
695 }
696 }
697 } else if (r_1->is_Register()) {
698 Register r = r_1->as_Register();
699 if (!r_2->is_valid()) {
700 // must be only an int (or less), so move only 32 bits to the slot
701 // why not sign extend??
702 __ movl(Address(rsp, st_off), r);
703 } else {
704 // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
705 // T_DOUBLE and T_LONG use two slots in the interpreter
706 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
707 // long/double in gpr
708 #ifdef ASSERT
709 // Overwrite the unused slot with known junk
710 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
711 __ movptr(Address(rsp, st_off), rax);
712 #endif /* ASSERT */
713 __ movq(Address(rsp, next_off), r);
714 } else {
715 __ movptr(Address(rsp, st_off), r);
716 }
717 }
718 } else {
719 assert(r_1->is_XMMRegister(), "");
720 if (!r_2->is_valid()) {
721 // only a float, use just part of the slot
722 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
723 } else {
724 #ifdef ASSERT
725 // Overwrite the unused slot with known junk
726 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
727 __ movptr(Address(rsp, st_off), rax);
728 #endif /* ASSERT */
729 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
730 }
731 }
732 }
733
734 // Schedule the branch target address early.
735 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
736 __ jmp(rcx);
737 }
738
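// Branches to L_ok if pc_reg lies strictly inside (code_start, code_end); otherwise it
// falls through so the caller can emit the failure path immediately after.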
739 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
740 address code_start, address code_end,
741 Label& L_ok) {
742 Label L_fail;
743 __ lea(temp_reg, ExternalAddress(code_start));
744 __ cmpptr(pc_reg, temp_reg);
745 __ jcc(Assembler::belowEqual, L_fail);
746 __ lea(temp_reg, ExternalAddress(code_end));
747 __ cmpptr(pc_reg, temp_reg);
748 __ jcc(Assembler::below, L_ok);
749 __ bind(L_fail);
750 }
751
752 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
753 int total_args_passed,
754 int comp_args_on_stack,
755 const BasicType *sig_bt,
756 const VMRegPair *regs) {
757
758 // Note: r13 contains the senderSP on entry. We must preserve it since
759 // we may do an i2c -> c2i transition if we lose a race where compiled
760 // code goes non-entrant while we get args ready.
761 // In addition we use r13 to locate all the interpreter args as
762 // we must align the stack to 16 bytes on an i2c entry, else we
763 // lose the alignment we expect in all compiled code and the register
764 // save code can segv when fxsave instructions find an improperly
765 // aligned stack pointer.
766
767 // Adapters can be frameless because they do not require the caller
768 // to perform additional cleanup work, such as correcting the stack pointer.
769 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
770 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
771 // even if a callee has modified the stack pointer.
772 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
773 // routinely repairs its caller's stack pointer (from sender_sp, which is set
774 // up via the senderSP register).
775 // In other words, if *either* the caller or callee is interpreted, we can
776 // get the stack pointer repaired after a call.
777 // This is why c2i and i2c adapters cannot be indefinitely composed.
778 // In particular, if a c2i adapter were to somehow call an i2c adapter,
779 // both caller and callee would be compiled methods, and neither would
780 // clean up the stack pointer changes performed by the two adapters.
781 // If this happens, control eventually transfers back to the compiled
782 // caller, but with an uncorrected stack, causing delayed havoc.
783
784 // Pick up the return address
785 __ movptr(rax, Address(rsp, 0));
786
787 if (VerifyAdapterCalls &&
788 (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
789 // So, let's test for cascading c2i/i2c adapters right now.
790 // assert(Interpreter::contains($return_addr) ||
791 // StubRoutines::contains($return_addr),
792 // "i2c adapter must return to an interpreter frame");
793 __ block_comment("verify_i2c { ");
794 Label L_ok;
795 if (Interpreter::code() != NULL)
796 range_check(masm, rax, r11,
797 Interpreter::code()->code_start(), Interpreter::code()->code_end(),
798 L_ok);
799 if (StubRoutines::code1() != NULL)
800 range_check(masm, rax, r11,
801 StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
802 L_ok);
803 if (StubRoutines::code2() != NULL)
804 range_check(masm, rax, r11,
805 StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
806 L_ok);
807 const char* msg = "i2c adapter must return to an interpreter frame";
808 __ block_comment(msg);
809 __ stop(msg);
810 __ bind(L_ok);
811 __ block_comment("} verify_i2c ");
812 }
813
814 // Must preserve original SP for loading incoming arguments because
815 // we need to align the outgoing SP for compiled code.
816 __ movptr(r11, rsp);
817
818 // Cut-out for having no stack args. Since up to 6 int/oop args are passed
819 // in registers, we will occasionally have no stack args.
820 int comp_words_on_stack = 0;
821 if (comp_args_on_stack) {
822 // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
823 // registers are below. By subtracting stack0, we either get a negative
824 // number (all values in registers) or the maximum stack slot accessed.
825
826 // Convert 4-byte c2 stack slots to words.
827 comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
828 // Round up to minimum stack alignment, in wordSize
829 comp_words_on_stack = align_up(comp_words_on_stack, 2);
830 __ subptr(rsp, comp_words_on_stack * wordSize);
831 }
832
833
834 // Ensure compiled code always sees stack at proper alignment
835 __ andptr(rsp, -16);
836
837 // push the return address and misalign the stack so that the youngest frame looks
838 // just as it would immediately after a call instruction
839 __ push(rax);
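// rsp is now 16-byte aligned minus one word, which is exactly what compiled code expects
// to see immediately after a call instruction.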
840
841 // Put saved SP in another register
842 const Register saved_sp = rax;
843 __ movptr(saved_sp, r11);
844
845 // Will jump to the compiled code just as if compiled code was doing it.
846 // Pre-load the register-jump target early, to schedule it better.
847 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
848
849 #if INCLUDE_JVMCI
850 if (EnableJVMCI) {
851 // check if this call should be routed towards a specific entry point
852 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
853 Label no_alternative_target;
854 __ jcc(Assembler::equal, no_alternative_target);
855 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
856 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
857 __ bind(no_alternative_target);
858 }
859 #endif // INCLUDE_JVMCI
860
861 // Now generate the shuffle code. Pick up all register args and move the
862 // rest through the floating point stack top.
863 for (int i = 0; i < total_args_passed; i++) {
864 if (sig_bt[i] == T_VOID) {
865 // Longs and doubles are passed in native word order, but misaligned
866 // in the 32-bit build.
867 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
868 continue;
869 }
870
871 // Pick up 0, 1 or 2 words from SP+offset.
872
873 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
874 "scrambled load targets?");
875 // Load in argument order going down.
876 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
877 // Point to interpreter value (vs. tag)
878 int next_off = ld_off - Interpreter::stackElementSize;
879 //
880 //
881 //
882 VMReg r_1 = regs[i].first();
883 VMReg r_2 = regs[i].second();
884 if (!r_1->is_valid()) {
885 assert(!r_2->is_valid(), "");
886 continue;
887 }
888 if (r_1->is_stack()) {
889 // Convert stack slot to an SP offset (+ wordSize to account for return address )
890 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
891
892 // We can use r13 as a temp here because compiled code doesn't need r13 as an input
893 // and if we end up going through a c2i because of a miss, a reasonable value of r13
894 // will be generated.
895 if (!r_2->is_valid()) {
896 // sign extend???
897 __ movl(r13, Address(saved_sp, ld_off));
898 __ movptr(Address(rsp, st_off), r13);
899 } else {
900 //
901 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
902 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
903 // So we must adjust where to pick up the data to match the interpreter.
904 //
905 // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
906 // are accessed with negative offsets, so the LSW is at the lower address
907
908 // ld_off is MSW so get LSW
909 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
910 next_off : ld_off;
911 __ movq(r13, Address(saved_sp, offset));
912 // st_off is LSW (i.e. reg.first())
913 __ movq(Address(rsp, st_off), r13);
914 }
915 } else if (r_1->is_Register()) { // Register argument
916 Register r = r_1->as_Register();
917 assert(r != rax, "must be different");
918 if (r_2->is_valid()) {
919 //
920 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
921 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
922 // So we must adjust where to pick up the data to match the interpreter.
923
924 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
925 next_off : ld_off;
926
927 // this can be a misaligned move
928 __ movq(r, Address(saved_sp, offset));
929 } else {
930 // sign extend and use a full word?
931 __ movl(r, Address(saved_sp, ld_off));
932 }
933 } else {
934 if (!r_2->is_valid()) {
935 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
936 } else {
937 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
938 }
939 }
940 }
941
942 // 6243940 We might end up in handle_wrong_method if
943 // the callee is deoptimized as we race thru here. If that
944 // happens we don't want to take a safepoint because the
945 // caller frame will look interpreted and arguments are now
946 // "compiled" so it is much better to make this transition
947 // invisible to the stack walking code. Unfortunately if
948 // we try and find the callee by normal means a safepoint
949 // is possible. So we stash the desired callee in the thread
950 // and the VM will find it there should this case occur.
951
952 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
953
954 // put Method* where a c2i would expect it should we end up there
955 // only needed because c2 resolve stubs return Method* as a result in
956 // rax
957 __ mov(rax, rbx);
958 __ jmp(r11);
959 }
960
961 // ---------------------------------------------------------------
962 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
963 int total_args_passed,
964 int comp_args_on_stack,
965 const BasicType *sig_bt,
966 const VMRegPair *regs,
967 AdapterFingerPrint* fingerprint) {
968 address i2c_entry = __ pc();
969
970 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
971
972 // -------------------------------------------------------------------------
973 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
974 // to the interpreter. The args start out packed in the compiled layout. They
975 // need to be unpacked into the interpreter layout. This will almost always
976 // require some stack space. We grow the current (compiled) stack, then repack
977 // the args. We finally end in a jump to the generic interpreter entry point.
978 // On exit from the interpreter, the interpreter will restore our SP (lest the
979 // compiled code, which relies solely on SP and not RBP, get sick).
980
981 address c2i_unverified_entry = __ pc();
982 Label skip_fixup;
983 Label ok;
984
985 Register holder = rax;
986 Register receiver = j_rarg0;
987 Register temp = rbx;
988
989 {
990 __ load_klass(temp, receiver, rscratch1);
991 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
992 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
993 __ jcc(Assembler::equal, ok);
994 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
995
996 __ bind(ok);
997 // Method might have been compiled since the call site was patched to
998 // interpreted; if that is the case, treat it as a miss so we can get
999 // the call site corrected.
1000 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1001 __ jcc(Assembler::equal, skip_fixup);
1002 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1003 }
1004
1005 address c2i_entry = __ pc();
1006
1007 // Class initialization barrier for static methods
1008 address c2i_no_clinit_check_entry = NULL;
1009 if (VM_Version::supports_fast_class_init_checks()) {
1010 Label L_skip_barrier;
1011 Register method = rbx;
1012
1013 { // Bypass the barrier for non-static methods
1014 Register flags = rscratch1;
1015 __ movl(flags, Address(method, Method::access_flags_offset()));
1016 __ testl(flags, JVM_ACC_STATIC);
1017 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1018 }
1019
1020 Register klass = rscratch1;
1021 __ load_method_holder(klass, method);
1022 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1023
1024 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1025
1026 __ bind(L_skip_barrier);
1027 c2i_no_clinit_check_entry = __ pc();
1028 }
1029
1030 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1031 bs->c2i_entry_barrier(masm);
1032
1033 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1034
1035 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1036 }
1037
1038 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1039 VMRegPair *regs,
1040 VMRegPair *regs2,
1041 int total_args_passed) {
1042 assert(regs2 == NULL, "not needed on x86");
1043 // We return the amount of VMRegImpl stack slots we need to reserve for all
1044 // the arguments NOT counting out_preserve_stack_slots.
1045
1046 // NOTE: These arrays will have to change when c1 is ported
1047 #ifdef _WIN64
1048 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1049 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1050 };
1051 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1052 c_farg0, c_farg1, c_farg2, c_farg3
1053 };
1054 #else
1055 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1056 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1057 };
1058 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1059 c_farg0, c_farg1, c_farg2, c_farg3,
1060 c_farg4, c_farg5, c_farg6, c_farg7
1061 };
1062 #endif // _WIN64
1063
1064
1065 uint int_args = 0;
1066 uint fp_args = 0;
1067 uint stk_args = 0; // inc by 2 each time
1068
1069 for (int i = 0; i < total_args_passed; i++) {
1070 switch (sig_bt[i]) {
1071 case T_BOOLEAN:
1072 case T_CHAR:
1073 case T_BYTE:
1074 case T_SHORT:
1075 case T_INT:
1076 if (int_args < Argument::n_int_register_parameters_c) {
1077 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1078 #ifdef _WIN64
1079 fp_args++;
1080 // Allocate slots for the callee to stuff register args on the stack.
1081 stk_args += 2;
1082 #endif
1083 } else {
1084 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1085 stk_args += 2;
1086 }
1087 break;
1088 case T_LONG:
1089 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1090 // fall through
1091 case T_OBJECT:
1092 case T_ARRAY:
1093 case T_ADDRESS:
1094 case T_METADATA:
1095 if (int_args < Argument::n_int_register_parameters_c) {
1096 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1097 #ifdef _WIN64
1098 fp_args++;
1099 stk_args += 2;
1100 #endif
1101 } else {
1102 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1103 stk_args += 2;
1104 }
1105 break;
1106 case T_FLOAT:
1107 if (fp_args < Argument::n_float_register_parameters_c) {
1108 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1109 #ifdef _WIN64
1110 int_args++;
1111 // Allocate slots for the callee to stuff register args on the stack.
1112 stk_args += 2;
1113 #endif
1114 } else {
1115 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1116 stk_args += 2;
1117 }
1118 break;
1119 case T_DOUBLE:
1120 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1121 if (fp_args < Argument::n_float_register_parameters_c) {
1122 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1123 #ifdef _WIN64
1124 int_args++;
1125 // Allocate slots for the callee to stuff register args on the stack.
1126 stk_args += 2;
1127 #endif
1128 } else {
1129 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1130 stk_args += 2;
1131 }
1132 break;
1133 case T_VOID: // Halves of longs and doubles
1134 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1135 regs[i].set_bad();
1136 break;
1137 default:
1138 ShouldNotReachHere();
1139 break;
1140 }
1141 }
1142 #ifdef _WIN64
1143 // The Windows ABI requires that we always allocate enough stack space
1144 // for 4 64-bit registers to be stored down.
1145 if (stk_args < 8) {
1146 stk_args = 8;
1147 }
1148 #endif // _WIN64
1149
1150 return stk_args;
1151 }
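// Worked example (illustration only): for a native signature flattened to
// { T_ADDRESS, T_OBJECT, T_INT, T_DOUBLE, T_VOID } (e.g. JNIEnv*, jobject, jint, jdouble),
// the System V path above assigns c_rarg0, c_rarg1, c_rarg2 and c_farg0 and returns 0,
// while the Win64 path assigns c_rarg0, c_rarg1, c_rarg2 and c_farg3 (integer and FP
// arguments share positions there) and returns 8 slots of mandatory register home space.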
1152
1153 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1154 uint num_bits,
1155 uint total_args_passed) {
1156 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1157 "only certain vector sizes are supported for now");
1158
1159 static const XMMRegister VEC_ArgReg[32] = {
1160 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1161 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1162 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1163 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1164 };
1165
1166 uint stk_args = 0;
1167 uint fp_args = 0;
1168
1169 for (uint i = 0; i < total_args_passed; i++) {
1170 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1171 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1172 regs[i].set_pair(vmreg->next(next_val), vmreg);
1173 }
1174
1175 return stk_args;
1176 }
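// For example, with num_bits == 256 each argument i lands in xmm(i) paired with its next
// 7 VMReg slots (the full YMM width); vector arguments are never passed on the stack
// here, so the function always returns 0.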
1177
1178 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1179 // We always ignore the frame_slots arg and just use the space just below the frame
1180 // pointer, which by this time is free to use
1181 switch (ret_type) {
1182 case T_FLOAT:
1183 __ movflt(Address(rbp, -wordSize), xmm0);
1184 break;
1185 case T_DOUBLE:
1186 __ movdbl(Address(rbp, -wordSize), xmm0);
1187 break;
1188 case T_VOID: break;
1189 default: {
1190 __ movptr(Address(rbp, -wordSize), rax);
1191 }
1192 }
1193 }
1194
1195 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1196 // We always ignore the frame_slots arg and just use the space just below the frame
1197 // pointer, which by this time is free to use
1198 switch (ret_type) {
1199 case T_FLOAT:
1200 __ movflt(xmm0, Address(rbp, -wordSize));
1201 break;
1202 case T_DOUBLE:
1203 __ movdbl(xmm0, Address(rbp, -wordSize));
1204 break;
1205 case T_VOID: break;
1206 default: {
1207 __ movptr(rax, Address(rbp, -wordSize));
1208 }
1209 }
1210 }
1211
1212 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1213 for ( int i = first_arg ; i < arg_count ; i++ ) {
1214 if (args[i].first()->is_Register()) {
1215 __ push(args[i].first()->as_Register());
1216 } else if (args[i].first()->is_XMMRegister()) {
1217 __ subptr(rsp, 2*wordSize);
1218 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1219 }
1220 }
1221 }
1222
1223 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1225 if (args[i].first()->is_Register()) {
1226 __ pop(args[i].first()->as_Register());
1227 } else if (args[i].first()->is_XMMRegister()) {
1228 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1229 __ addptr(rsp, 2*wordSize);
1230 }
1231 }
1232 }
1233
1234 // Unpack an array argument into a pointer to the body and the length
1235 // if the array is non-null, otherwise pass 0 for both.
1236 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1237 Register tmp_reg = rax;
1238 assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1239 "possible collision");
1240 assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1241 "possible collision");
1242
1243 __ block_comment("unpack_array_argument {");
1244
1245 // Pass the length, ptr pair
1246 Label is_null, done;
1247 VMRegPair tmp;
1248 tmp.set_ptr(tmp_reg->as_VMReg());
1249 if (reg.first()->is_stack()) {
1250 // Load the arg up from the stack
1251 __ move_ptr(reg, tmp);
1252 reg = tmp;
1253 }
1254 __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1255 __ jccb(Assembler::equal, is_null);
1256 __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1257 __ move_ptr(tmp, body_arg);
1258 // load the length relative to the body.
1259 __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1260 arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1261 __ move32_64(tmp, length_arg);
1262 __ jmpb(done);
1263 __ bind(is_null);
1264 // Pass zeros
1265 __ xorptr(tmp_reg, tmp_reg);
1266 __ move_ptr(tmp, body_arg);
1267 __ move32_64(tmp, length_arg);
1268 __ bind(done);
1269
1270 __ block_comment("} unpack_array_argument");
1271 }
1272
1273
1274 // Different signatures may require very different orders for the move
1275 // to avoid clobbering other arguments. There's no simple way to
1276 // order them safely. Compute a safe order for issuing stores and
1277 // break any cycles in those stores. This code is fairly general but
1278 // it's not necessary on the other platforms so we keep it in the
1279 // platform dependent code instead of moving it into a shared file.
1280 // (See bugs 7013347 & 7145024.)
1281 // Note that this code is specific to LP64.
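// Illustrative example: if one argument needs rdi -> rsi while another needs rsi -> rdi,
// the two stores form a cycle; get_store_order() detects it and break_cycle() routes one
// value through tmp_vmreg, yielding the safe order rsi -> tmp, rdi -> rsi, tmp -> rdi
// (register names here are purely for illustration).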
1282 class ComputeMoveOrder: public StackObj {
1283 class MoveOperation: public ResourceObj {
1284 friend class ComputeMoveOrder;
1285 private:
1286 VMRegPair _src;
1287 VMRegPair _dst;
1288 int _src_index;
1289 int _dst_index;
1290 bool _processed;
1291 MoveOperation* _next;
1292 MoveOperation* _prev;
1293
1294 static int get_id(VMRegPair r) {
1295 return r.first()->value();
1296 }
1297
1298 public:
1299 MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1300 _src(src)
1301 , _dst(dst)
1302 , _src_index(src_index)
1303 , _dst_index(dst_index)
1304 , _processed(false)
1305 , _next(NULL)
1306 , _prev(NULL) {
1307 }
1308
1309 VMRegPair src() const { return _src; }
1310 int src_id() const { return get_id(src()); }
1311 int src_index() const { return _src_index; }
1312 VMRegPair dst() const { return _dst; }
1313 void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1314 int dst_index() const { return _dst_index; }
1315 int dst_id() const { return get_id(dst()); }
1316 MoveOperation* next() const { return _next; }
1317 MoveOperation* prev() const { return _prev; }
1318 void set_processed() { _processed = true; }
1319 bool is_processed() const { return _processed; }
1320
1321 // insert
1322 void break_cycle(VMRegPair temp_register) {
1323 // create a new store following the last store
1324 // to move from the temp_register to the original
1325 MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1326
1327 // break the cycle of links and insert new_store at the end
1328 // break the reverse link.
1329 MoveOperation* p = prev();
1330 assert(p->next() == this, "must be");
1331 _prev = NULL;
1332 p->_next = new_store;
1333 new_store->_prev = p;
1334
1335 // change the original store to save its value in the temp.
1336 set_dst(-1, temp_register);
1337 }
1338
1339 void link(GrowableArray<MoveOperation*>& killer) {
1340 // link this store in front the store that it depends on
1341 MoveOperation* n = killer.at_grow(src_id(), NULL);
1342 if (n != NULL) {
1343 assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1344 _next = n;
1345 n->_prev = this;
1346 }
1347 }
1348 };
1349
1350 private:
1351 GrowableArray<MoveOperation*> edges;
1352
1353 public:
1354 ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1355 const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1356 // Move operations where the dest is the stack can all be
1357 // scheduled first since they can't interfere with the other moves.
1358 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1359 if (in_sig_bt[i] == T_ARRAY) {
1360 c_arg--;
1361 if (out_regs[c_arg].first()->is_stack() &&
1362 out_regs[c_arg + 1].first()->is_stack()) {
1363 arg_order.push(i);
1364 arg_order.push(c_arg);
1365 } else {
1366 if (out_regs[c_arg].first()->is_stack() ||
1367 in_regs[i].first() == out_regs[c_arg].first()) {
1368 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1369 } else {
1370 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1371 }
1372 }
1373 } else if (in_sig_bt[i] == T_VOID) {
1374 arg_order.push(i);
1375 arg_order.push(c_arg);
1376 } else {
1377 if (out_regs[c_arg].first()->is_stack() ||
1378 in_regs[i].first() == out_regs[c_arg].first()) {
1379 arg_order.push(i);
1380 arg_order.push(c_arg);
1381 } else {
1382 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1383 }
1384 }
1385 }
1386 // Break any cycles in the register moves and emit them in the
1387 // proper order.
1388 GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1389 for (int i = 0; i < stores->length(); i++) {
1390 arg_order.push(stores->at(i)->src_index());
1391 arg_order.push(stores->at(i)->dst_index());
1392 }
1393 }
1394
1395 // Collect all the move operations
1396 void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1397 if (src.first() == dst.first()) return;
1398 edges.append(new MoveOperation(src_index, src, dst_index, dst));
1399 }
1400
1401 // Walk the edges breaking cycles between moves. The result list
1402 // can be walked in order to produce the proper set of loads
1403 GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1404 // Record which moves kill which values
1405 GrowableArray<MoveOperation*> killer;
1406 for (int i = 0; i < edges.length(); i++) {
1407 MoveOperation* s = edges.at(i);
1408 assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1409 killer.at_put_grow(s->dst_id(), s, NULL);
1410 }
1411 assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1412 "make sure temp isn't in the registers that are killed");
1413
1414 // create links between loads and stores
1415 for (int i = 0; i < edges.length(); i++) {
1416 edges.at(i)->link(killer);
1417 }
1418
1419 // at this point, all the move operations are chained together
1420 // in a doubly linked list. Processing it backwards finds
1421 // the beginning of the chain, forwards finds the end. If there's
1422 // a cycle it can be broken at any point, so pick an edge and walk
1423 // backward until the list ends or we end where we started.
1424 GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1425 for (int e = 0; e < edges.length(); e++) {
1426 MoveOperation* s = edges.at(e);
1427 if (!s->is_processed()) {
1428 MoveOperation* start = s;
1429 // search for the beginning of the chain or cycle
1430 while (start->prev() != NULL && start->prev() != s) {
1431 start = start->prev();
1432 }
1433 if (start->prev() == s) {
1434 start->break_cycle(temp_register);
1435 }
1436 // walk the chain forward inserting to store list
1437 while (start != NULL) {
1438 stores->append(start);
1439 start->set_processed();
1440 start = start->next();
1441 }
1442 }
1443 }
1444 return stores;
1445 }
1446 };
1447
1448 static void verify_oop_args(MacroAssembler* masm,
1449 const methodHandle& method,
1450 const BasicType* sig_bt,
1451 const VMRegPair* regs) {
1452 Register temp_reg = rbx; // not part of any compiled calling seq
1453 if (VerifyOops) {
1454 for (int i = 0; i < method->size_of_parameters(); i++) {
1455 if (is_reference_type(sig_bt[i])) {
1456 VMReg r = regs[i].first();
1457 assert(r->is_valid(), "bad oop arg");
1458 if (r->is_stack()) {
1459 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1460 __ verify_oop(temp_reg);
1461 } else {
1462 __ verify_oop(r->as_Register());
1463 }
1464 }
1465 }
1466 }
1467 }
1468
1469 static void gen_special_dispatch(MacroAssembler* masm,
1470 const methodHandle& method,
1471 const BasicType* sig_bt,
1472 const VMRegPair* regs) {
1473 verify_oop_args(masm, method, sig_bt, regs);
1474 vmIntrinsics::ID iid = method->intrinsic_id();
1475
1476 // Now write the args into the outgoing interpreter space
1477 bool has_receiver = false;
1478 Register receiver_reg = noreg;
1479 int member_arg_pos = -1;
1480 Register member_reg = noreg;
1481 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1482 if (ref_kind != 0) {
1483 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1484 member_reg = rbx; // known to be free at this point
1485 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1486 } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1487 has_receiver = true;
1488 } else {
1489 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1490 }
1491
1492 if (member_reg != noreg) {
1493 // Load the member_arg into register, if necessary.
1494 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1495 VMReg r = regs[member_arg_pos].first();
1496 if (r->is_stack()) {
1497 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1498 } else {
1499 // no data motion is needed
1500 member_reg = r->as_Register();
1501 }
1502 }
1503
1504 if (has_receiver) {
1505 // Make sure the receiver is loaded into a register.
1506 assert(method->size_of_parameters() > 0, "oob");
1507 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1508 VMReg r = regs[0].first();
1509 assert(r->is_valid(), "bad receiver arg");
1510 if (r->is_stack()) {
1511 // Porting note: This assumes that compiled calling conventions always
1512 // pass the receiver oop in a register. If this is not true on some
1513 // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, since it's impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them. To figure out where they go
1587 // we convert the java signature to a C signature by inserting
1588 // the hidden arguments as arg[0] and possibly arg[1] (static method)
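// For illustration only: for a non-static method arg[0] becomes the JNIEnv*
// (T_ADDRESS) followed by the unchanged Java arguments (the receiver is
// already Java arg 0); for a static method an extra T_OBJECT slot for the
// class mirror is inserted as arg[1] ahead of the Java arguments.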
1589
1590 const int total_in_args = method->size_of_parameters();
1591 int total_c_args = total_in_args;
1592 if (!is_critical_native) {
1593 total_c_args += 1;
1594 if (method->is_static()) {
1595 total_c_args++;
1596 }
1597 } else {
1598 for (int i = 0; i < total_in_args; i++) {
1599 if (in_sig_bt[i] == T_ARRAY) {
1600 total_c_args++;
1601 }
1602 }
1603 }
1604
1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1607 BasicType* in_elem_bt = NULL;
1608
1609 int argc = 0;
1610 if (!is_critical_native) {
1611 out_sig_bt[argc++] = T_ADDRESS;
1612 if (method->is_static()) {
1613 out_sig_bt[argc++] = T_OBJECT;
1614 }
1615
1616 for (int i = 0; i < total_in_args ; i++ ) {
1617 out_sig_bt[argc++] = in_sig_bt[i];
1618 }
1619 } else {
1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1621 SignatureStream ss(method->signature());
1622 for (int i = 0; i < total_in_args ; i++ ) {
1623 if (in_sig_bt[i] == T_ARRAY) {
1624 // Arrays are passed as int, elem* pair
1625 out_sig_bt[argc++] = T_INT;
1626 out_sig_bt[argc++] = T_ADDRESS;
1627 ss.skip_array_prefix(1); // skip one '['
1628 assert(ss.is_primitive(), "primitive type expected");
1629 in_elem_bt[i] = ss.type();
1630 } else {
1631 out_sig_bt[argc++] = in_sig_bt[i];
1632 in_elem_bt[i] = T_VOID;
1633 }
1634 if (in_sig_bt[i] != T_VOID) {
1635 assert(in_sig_bt[i] == ss.type() ||
1636 in_sig_bt[i] == T_ARRAY, "must match");
1637 ss.next();
1638 }
1639 }
1640 }
1641
1642 // Now figure out where the args must be stored and how much stack space
1643 // they require.
1644 int out_arg_slots;
1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1646
1647 // Compute framesize for the wrapper. We need to handlize all oops in
1648 // incoming registers
1649
1650 // Calculate the total number of stack slots we will need.
1651
1652 // First count the abi requirement plus all of the outgoing args
1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1654
1655 // Now the space for the inbound oop handle area
1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
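// For illustration (assuming 2 stack slots per 64-bit word): the regular JNI
// case reserves 6 * 2 = 12 slots here, i.e. one 8-byte handle slot for each
// of the 6 integer argument registers that might carry an oop.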
1657 if (is_critical_native) {
1658 // Critical natives may have to call out so they need a save area
1659 // for register arguments.
1660 int double_slots = 0;
1661 int single_slots = 0;
1662 for ( int i = 0; i < total_in_args; i++) {
1663 if (in_regs[i].first()->is_Register()) {
1664 const Register reg = in_regs[i].first()->as_Register();
1665 switch (in_sig_bt[i]) {
1666 case T_BOOLEAN:
1667 case T_BYTE:
1668 case T_SHORT:
1669 case T_CHAR:
1670 case T_INT: single_slots++; break;
1671 case T_ARRAY: // specific to LP64 (7145024)
1672 case T_LONG: double_slots++; break;
1673 default: ShouldNotReachHere();
1674 }
1675 } else if (in_regs[i].first()->is_XMMRegister()) {
1676 switch (in_sig_bt[i]) {
1677 case T_FLOAT: single_slots++; break;
1678 case T_DOUBLE: double_slots++; break;
1679 default: ShouldNotReachHere();
1680 }
1681 } else if (in_regs[i].first()->is_FloatRegister()) {
1682 ShouldNotReachHere();
1683 }
1684 }
1685 total_save_slots = double_slots * 2 + single_slots;
1686 // align the save area
1687 if (double_slots != 0) {
1688 stack_slots = align_up(stack_slots, 2);
1689 }
1690 }
1691
1692 int oop_handle_offset = stack_slots;
1693 stack_slots += total_save_slots;
1694
1695 // Now any space we need for handlizing a klass if static method
1696
1697 int klass_slot_offset = 0;
1698 int klass_offset = -1;
1699 int lock_slot_offset = 0;
1700 bool is_static = false;
1701
1702 if (method->is_static()) {
1703 klass_slot_offset = stack_slots;
1704 stack_slots += VMRegImpl::slots_per_word;
1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1706 is_static = true;
1707 }
1708
1709 // Plus a lock if needed
1710
1711 if (method->is_synchronized()) {
1712 lock_slot_offset = stack_slots;
1713 stack_slots += VMRegImpl::slots_per_word;
1714 }
1715
1716 // Now a place (+2) to save return values or temp during shuffling
1717 // + 4 for return address (which we own) and saved rbp
1718 stack_slots += 6;
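// For illustration (assuming 4-byte stack slots, 2 per 64-bit word): the 6
// slots added above are 2 slots (one word) for the return-value/temp spill
// area plus 4 slots (two words) for the return address and the saved rbp.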
1719
1720 // OK, the space we have allocated will look like:
1721 //
1722 //
1723 // FP-> | |
1724 // |---------------------|
1725 // | 2 slots for moves |
1726 // |---------------------|
1727 // | lock box (if sync) |
1728 // |---------------------| <- lock_slot_offset
1729 // | klass (if static) |
1730 // |---------------------| <- klass_slot_offset
1731 // | oopHandle area |
1732 // |---------------------| <- oop_handle_offset (6 java arg registers)
1733 // | outbound memory |
1734 // | based arguments |
1735 // | |
1736 // |---------------------|
1737 // | |
1738 // SP-> | out_preserved_slots |
1739 //
1740 //
1741
1742
1743 // Now compute the actual number of stack words we need, rounding up to keep
1744 // the stack properly aligned.
1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1746
1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
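// For illustration (assuming StackAlignmentInBytes == 16, so
// StackAlignmentInSlots == 4): a raw count of, say, 37 slots rounds up to 40
// slots, giving a 160-byte frame.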
1748
1749 // First thing: make an ic check to see if we should even be here
1750
1751 // We are free to use all registers as temps without saving them and
1752 // restoring them except rbp. rbp is the only callee save register
1753 // as far as the interpreter and the compiler(s) are concerned.
1754
1755
1756 const Register ic_reg = rax;
1757 const Register receiver = j_rarg0;
1758
1759 Label hit;
1760 Label exception_pending;
1761
1762 assert_different_registers(ic_reg, receiver, rscratch1);
1763 __ verify_oop(receiver);
1764 __ load_klass(rscratch1, receiver, rscratch2);
1765 __ cmpq(ic_reg, rscratch1);
1766 __ jcc(Assembler::equal, hit);
1767
1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1769
1770 // Verified entry point must be aligned
1771 __ align(8);
1772
1773 __ bind(hit);
1774
1775 int vep_offset = ((intptr_t)__ pc()) - start;
1776
1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1778 Label L_skip_barrier;
1779 Register klass = r10;
1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1782
1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1784
1785 __ bind(L_skip_barrier);
1786 }
1787
1788 #ifdef COMPILER1
1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1792 }
1793 #endif // COMPILER1
1794
1795 // The instruction at the verified entry point must be 5 bytes or longer
1796 // because it can be patched on the fly by make_not_entrant. The stack bang
1797 // instruction fits that requirement.
1798
1799 // Generate stack overflow check
1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1801
1802 // Generate a new frame for the wrapper.
1803 __ enter();
1804 // -2 because return address is already present and so is saved rbp
1805 __ subptr(rsp, stack_size - 2*wordSize);
1806
1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1808 bs->nmethod_entry_barrier(masm);
1809
1810 // Frame is now completed as far as size and linkage.
1811 int frame_complete = ((intptr_t)__ pc()) - start;
1812
1813 if (UseRTMLocking) {
1814 // Abort RTM transaction before calling JNI
1815 // because critical section will be large and will be
1816 // aborted anyway. Also nmethod could be deoptimized.
1817 __ xabort(0);
1818 }
1819
1820 #ifdef ASSERT
1821 {
1822 Label L;
1823 __ mov(rax, rsp);
1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1825 __ cmpptr(rax, rsp);
1826 __ jcc(Assembler::equal, L);
1827 __ stop("improperly aligned stack");
1828 __ bind(L);
1829 }
1830 #endif /* ASSERT */
1831
1832
1833 // We use r14 as the oop handle for the receiver/klass
1834 // It is callee save so it survives the call to native
1835
1836 const Register oop_handle_reg = r14;
1837
1838 //
1839 // We immediately shuffle the arguments so that for any vm call we have to
1840 // make from here on out (sync slow path, jvmti, etc.) we will have
1841 // captured the oops from our caller and have a valid oopMap for
1842 // them.
1843
1844 // -----------------
1845 // The Grand Shuffle
1846
1847 // The Java calling convention is either equal (linux) or denser (win64) than the
1848 // c calling convention. However, because of the jni_env argument the c calling
1849 // convention always has at least one more argument (two more for static) than Java.
1850 // Therefore if we move the args from java -> c backwards then we will never have
1851 // a register->register conflict and we don't have to build a dependency graph
1852 // and figure out how to break any cycles.
1853 //
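// For illustration only: Java arg i lands at C arg i + 1 (i + 2 for a static
// method), so every destination register or slot sits "ahead" of its source
// in the outgoing sequence. Copying the highest-numbered argument first
// therefore never clobbers a source that a later (lower-numbered) move still
// has to read.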
1854
1855 // Record esp-based slot for receiver on stack for non-static methods
1856 int receiver_offset = -1;
1857
1858 // This is a trick. We double the stack slots so we can claim
1859 // the oops in the caller's frame. Since we are sure to have
1860 // more args than the caller, doubling is enough to make
1861 // sure we can capture all the incoming oop args from the
1862 // caller.
1863 //
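// For illustration only: incoming stack oops still live in the caller's
// frame, above this wrapper's stack_slots, so their recorded offsets can
// exceed stack_slots; sizing the map at stack_slots * 2 leaves room for them.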
1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1865
1866 // Mark location of rbp (someday)
1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1868
1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1870 // All inbound args are referenced based on rbp and all outbound args via rsp.
1871
1872
1873 #ifdef ASSERT
1874 bool reg_destroyed[RegisterImpl::number_of_registers];
1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1877 reg_destroyed[r] = false;
1878 }
1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1880 freg_destroyed[f] = false;
1881 }
1882
1883 #endif /* ASSERT */
1884
1885 // This may iterate in two different directions depending on the
1886 // kind of native it is. The reason is that for regular JNI natives
1887 // the incoming and outgoing registers are offset upwards and for
1888 // critical natives they are offset down.
1889 GrowableArray<int> arg_order(2 * total_in_args);
1890
1891 VMRegPair tmp_vmreg;
1892 tmp_vmreg.set2(rbx->as_VMReg());
1893
1894 if (!is_critical_native) {
1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1896 arg_order.push(i);
1897 arg_order.push(c_arg);
1898 }
1899 } else {
1900 // Compute a valid move order, using tmp_vmreg to break any cycles
1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1902 }
1903
1904 int temploc = -1;
1905 for (int ai = 0; ai < arg_order.length(); ai += 2) {
1906 int i = arg_order.at(ai);
1907 int c_arg = arg_order.at(ai + 1);
1908 __ block_comment(err_msg("move %d -> %d", i, c_arg));
1909 if (c_arg == -1) {
1910 assert(is_critical_native, "should only be required for critical natives");
1911 // This arg needs to be moved to a temporary
1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1913 in_regs[i] = tmp_vmreg;
1914 temploc = i;
1915 continue;
1916 } else if (i == -1) {
1917 assert(is_critical_native, "should only be required for critical natives");
1918 // Read from the temporary location
1919 assert(temploc != -1, "must be valid");
1920 i = temploc;
1921 temploc = -1;
1922 }
1923 #ifdef ASSERT
1924 if (in_regs[i].first()->is_Register()) {
1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1926 } else if (in_regs[i].first()->is_XMMRegister()) {
1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1928 }
1929 if (out_regs[c_arg].first()->is_Register()) {
1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1933 }
1934 #endif /* ASSERT */
1935 switch (in_sig_bt[i]) {
1936 case T_ARRAY:
1937 if (is_critical_native) {
1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1939 c_arg++;
1940 #ifdef ASSERT
1941 if (out_regs[c_arg].first()->is_Register()) {
1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1945 }
1946 #endif
1947 break;
1948 }
1949 case T_OBJECT:
1950 assert(!is_critical_native, "no oop arguments");
1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1952 ((i == 0) && (!is_static)),
1953 &receiver_offset);
1954 break;
1955 case T_VOID:
1956 break;
1957
1958 case T_FLOAT:
1959 __ float_move(in_regs[i], out_regs[c_arg]);
1960 break;
1961
1962 case T_DOUBLE:
1963 assert( i + 1 < total_in_args &&
1964 in_sig_bt[i + 1] == T_VOID &&
1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1966 __ double_move(in_regs[i], out_regs[c_arg]);
1967 break;
1968
1969 case T_LONG :
1970 __ long_move(in_regs[i], out_regs[c_arg]);
1971 break;
1972
1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1974
1975 default:
1976 __ move32_64(in_regs[i], out_regs[c_arg]);
1977 }
1978 }
1979
1980 int c_arg;
1981
1982 // Pre-load a static method's oop into r14. Used both by locking code and
1983 // the normal JNI call code.
1984 if (!is_critical_native) {
1985 // point c_arg at the first arg that is already loaded in case we
1986 // need to spill before we call out
1987 c_arg = total_c_args - total_in_args;
1988
1989 if (method->is_static()) {
1990
1991 // load oop into a register
1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1993
1994 // Now handlize the static class mirror it's known not-null.
1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1997
1998 // Now get the handle
1999 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2000 // store the klass handle as second argument
2001 __ movptr(c_rarg1, oop_handle_reg);
2002 // and protect the arg if we must spill
2003 c_arg--;
2004 }
2005 } else {
2006 // For JNI critical methods we need to save all registers in save_args.
2007 c_arg = 0;
2008 }
2009
2010 // Change state to native (we save the return address in the thread, since it might not
2011 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2012 // points into the right code segment. It does not have to be the correct return pc.
2013 // We use the same pc/oopMap repeatedly when we call out
2014
2015 intptr_t the_pc = (intptr_t) __ pc();
2016 oop_maps->add_gc_map(the_pc - start, map);
2017
2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2019
2020
2021 // We have all of the arguments set up at this point. We must not touch any
2022 // argument registers at this point (what if we save/restore them when there are no oops?).
2023
2024 {
2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2026 // protect the args we've loaded
2027 save_args(masm, total_c_args, c_arg, out_regs);
2028 __ mov_metadata(c_rarg1, method());
2029 __ call_VM_leaf(
2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2031 r15_thread, c_rarg1);
2032 restore_args(masm, total_c_args, c_arg, out_regs);
2033 }
2034
2035 // RedefineClasses() tracing support for obsolete method entry
2036 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2037 // protect the args we've loaded
2038 save_args(masm, total_c_args, c_arg, out_regs);
2039 __ mov_metadata(c_rarg1, method());
2040 __ call_VM_leaf(
2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2042 r15_thread, c_rarg1);
2043 restore_args(masm, total_c_args, c_arg, out_regs);
2044 }
2045
2046 // Lock a synchronized method
2047
2048 // Register definitions used by locking and unlocking
2049
2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2051 const Register obj_reg = rbx; // Will contain the oop
2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2053 const Register old_hdr = r13; // value of old header at unlock time
2054
2055 Label slow_path_lock;
2056 Label lock_done;
2057
2058 if (method->is_synchronized()) {
2059 assert(!is_critical_native, "unhandled");
2060
2061
2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2063
2064 // Get the handle (the 2nd argument)
2065 __ mov(oop_handle_reg, c_rarg1);
2066
2067 // Get address of the box
2068
2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2070
2071 // Load the oop from the handle
2072 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2073
2074 if (LockingMode == LM_MONITOR) {
2075 __ jmp(slow_path_lock);
2076 } else if (LockingMode == LM_LEGACY) {
2077 if (UseBiasedLocking) {
2078 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2079 }
2080
2081 // Load immediate 1 into swap_reg %rax
2082 __ movl(swap_reg, 1);
2083
2084 // Load (object->mark() | 1) into swap_reg %rax
2085 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2086
2087 // Save (object->mark() | 1) into BasicLock's displaced header
2088 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2089
2090 // src -> dest iff dest == rax else rax <- dest
2091 __ lock();
2092 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2093 __ jcc(Assembler::equal, lock_done);
2094
2095 // Hmm should this move to the slow path code area???
2096
2097 // Test if the oopMark is an obvious stack pointer, i.e.,
2098 // 1) (mark & 3) == 0, and
2099 // 2) rsp <= mark < rsp + os::pagesize()
2100 // These 3 tests can be done by evaluating the following
2101 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2102 // assuming both stack pointer and pagesize have their
2103 // least significant 2 bits clear.
2104 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
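// For illustration only (assuming a 4K page, so 3 - os::vm_page_size() is
// 0x...fffff003): if the displaced mark is on our own stack, e.g.
// rsp = 0x...1000 and mark = 0x...1040, then (mark - rsp) & 0x...f003 == 0,
// which is stored as the displaced header below, marking a recursive stack
// lock; a mark below rsp, at or beyond rsp + 4K, or with its low two bits
// set gives a non-zero result and we take the slow path.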
2105
2106 __ subptr(swap_reg, rsp);
2107 __ andptr(swap_reg, 3 - os::vm_page_size());
2108
2109 // Save the test result, for recursive case, the result is zero
2110 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2111 __ jcc(Assembler::notEqual, slow_path_lock);
2112 } else {
2113 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2114 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2115 }
2116
2117 // Slow path will re-enter here
2118
2119 __ bind(lock_done);
2120 }
2121
2122 // Finally just about ready to make the JNI call
2123
2124 // get JNIEnv* which is first argument to native
2125 if (!is_critical_native) {
2126 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2127
2128 // Now set thread in native
2129 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2130 }
2131
2132 __ call(RuntimeAddress(native_func));
2133
2134 // Verify or restore cpu control state after JNI call
2135 __ restore_cpu_control_state_after_jni();
2136
2137 // Unpack native results.
2138 switch (ret_type) {
2139 case T_BOOLEAN: __ c2bool(rax); break;
2140 case T_CHAR : __ movzwl(rax, rax); break;
2141 case T_BYTE : __ sign_extend_byte (rax); break;
2142 case T_SHORT : __ sign_extend_short(rax); break;
2143 case T_INT : /* nothing to do */ break;
2144 case T_DOUBLE :
2145 case T_FLOAT :
2146 // Result is in xmm0; we'll save it as needed
2147 break;
2148 case T_ARRAY: // Really a handle
2149 case T_OBJECT: // Really a handle
2150 break; // can't de-handlize until after safepoint check
2151 case T_VOID: break;
2152 case T_LONG: break;
2153 default : ShouldNotReachHere();
2154 }
2155
2156 Label after_transition;
2157
2158 // If this is a critical native, check for a safepoint or suspend request after the call.
2159 // If a safepoint is needed, transition to native, then to native_trans to handle
2160 // safepoints like the native methods that are not critical natives.
2161 if (is_critical_native) {
2162 Label needs_safepoint;
2163 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2164 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2165 __ jcc(Assembler::equal, after_transition);
2166 __ bind(needs_safepoint);
2167 }
2168
2169 // Switch thread to "native transition" state before reading the synchronization state.
2170 // This additional state is necessary because reading and testing the synchronization
2171 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2172 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2173 // VM thread changes sync state to synchronizing and suspends threads for GC.
2174 // Thread A is resumed to finish this native method, but doesn't block here since it
2175 // didn't see any synchronization in progress, and escapes.
2176 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2177
2178 // Force this write out before the read below
2179 __ membar(Assembler::Membar_mask_bits(
2180 Assembler::LoadLoad | Assembler::LoadStore |
2181 Assembler::StoreLoad | Assembler::StoreStore));
2182
2183 // check for safepoint operation in progress and/or pending suspend requests
2184 {
2185 Label Continue;
2186 Label slow_path;
2187
2188 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2189
2190 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2191 __ jcc(Assembler::equal, Continue);
2192 __ bind(slow_path);
2193
2194 // Don't use call_VM as it will see a possible pending exception and forward it
2195 // and never return here, preventing us from clearing _last_native_pc down below.
2196 // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2197 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2198 // by hand.
2199 //
2200 __ vzeroupper();
2201 save_native_result(masm, ret_type, stack_slots);
2202 __ mov(c_rarg0, r15_thread);
2203 __ mov(r12, rsp); // remember sp
2204 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2205 __ andptr(rsp, -16); // align stack as required by ABI
2206 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2207 __ mov(rsp, r12); // restore sp
2208 __ reinit_heapbase();
2209 // Restore any method result value
2210 restore_native_result(masm, ret_type, stack_slots);
2211 __ bind(Continue);
2212 }
2213
2214 // change thread state
2215 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2216 __ bind(after_transition);
2217
2218 Label reguard;
2219 Label reguard_done;
2220 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2221 __ jcc(Assembler::equal, reguard);
2222 __ bind(reguard_done);
2223
2224 // native result if any is live
2225
2226 // Unlock
2227 Label unlock_done;
2228 Label slow_path_unlock;
2229 if (method->is_synchronized()) {
2230
2231 // Get locked oop from the handle we passed to jni
2232 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2233
2234 Label done;
2235
2236 if (UseBiasedLocking) {
2237 __ biased_locking_exit(obj_reg, old_hdr, done);
2238 }
2239
2240 if (LockingMode == LM_LEGACY) {
2241 // Simple recursive lock?
2242
2243 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2244 __ jcc(Assembler::equal, done);
2245 }
2246
2247 // Must save rax if it is live now because cmpxchg must use it
2248 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2249 save_native_result(masm, ret_type, stack_slots);
2250 }
2251
2252 if (LockingMode == LM_MONITOR) {
2253 __ jmp(slow_path_unlock);
2254 } else if (LockingMode == LM_LEGACY) {
2255 // get address of the stack lock
2256 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2257 // get old displaced header
2258 __ movptr(old_hdr, Address(rax, 0));
2259
2260 // Atomic swap old header if oop still contains the stack lock
2261 __ lock();
2262 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2263 __ jcc(Assembler::notEqual, slow_path_unlock);
2264 } else {
2265 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2266 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2267 }
2268
2269 // slow path re-enters here
2270 __ bind(unlock_done);
2271 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2272 restore_native_result(masm, ret_type, stack_slots);
2273 }
2274
2275 __ bind(done);
2276
2277 }
2278 {
2279 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2280 save_native_result(masm, ret_type, stack_slots);
2281 __ mov_metadata(c_rarg1, method());
2282 __ call_VM_leaf(
2283 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2284 r15_thread, c_rarg1);
2285 restore_native_result(masm, ret_type, stack_slots);
2286 }
2287
2288 __ reset_last_Java_frame(false);
2289
2290 // Unbox oop result, e.g. JNIHandles::resolve value.
2291 if (is_reference_type(ret_type)) {
2292 __ resolve_jobject(rax /* value */,
2293 r15_thread /* thread */,
2294 rcx /* tmp */);
2295 }
2296
2297 if (CheckJNICalls) {
2298 // clear_pending_jni_exception_check
2299 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2300 }
2301
2302 if (!is_critical_native) {
2303 // reset handle block
2304 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2305 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2306 }
2307
2308 // pop our frame
2309
2310 __ leave();
2311
2312 if (!is_critical_native) {
2313 // Any exception pending?
2314 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2315 __ jcc(Assembler::notEqual, exception_pending);
2316 }
2317
2318 // Return
2319
2320 __ ret(0);
2321
2322 // Unexpected paths are out of line and go here
2323
2324 if (!is_critical_native) {
2325 // an exception is pending; handle it out of line
2326 __ bind(exception_pending);
2327
2328 // and forward the exception
2329 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2330 }
2331
2332 // Slow path locking & unlocking
2333 if (method->is_synchronized()) {
2334
2335 // BEGIN Slow path lock
2336 __ bind(slow_path_lock);
2337
2338 // has last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2339 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2340
2341 // protect the args we've loaded
2342 save_args(masm, total_c_args, c_arg, out_regs);
2343
2344 __ mov(c_rarg0, obj_reg);
2345 __ mov(c_rarg1, lock_reg);
2346 __ mov(c_rarg2, r15_thread);
2347
2348 // Not a leaf but we have last_Java_frame setup as we want
2349 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2350 restore_args(masm, total_c_args, c_arg, out_regs);
2351
2352 #ifdef ASSERT
2353 { Label L;
2354 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2355 __ jcc(Assembler::equal, L);
2356 __ stop("no pending exception allowed on exit from monitorenter");
2357 __ bind(L);
2358 }
2359 #endif
2360 __ jmp(lock_done);
2361
2362 // END Slow path lock
2363
2364 // BEGIN Slow path unlock
2365 __ bind(slow_path_unlock);
2366
2367 // If we haven't already saved the native result we must save it now as xmm registers
2368 // are still exposed.
2369 __ vzeroupper();
2370 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2371 save_native_result(masm, ret_type, stack_slots);
2372 }
2373
2374 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2375
2376 __ mov(c_rarg0, obj_reg);
2377 __ mov(c_rarg2, r15_thread);
2378 __ mov(r12, rsp); // remember sp
2379 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2380 __ andptr(rsp, -16); // align stack as required by ABI
2381
2382 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2383 // NOTE that obj_reg == rbx currently
2384 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2385 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2386
2387 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2388 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2389 __ mov(rsp, r12); // restore sp
2390 __ reinit_heapbase();
2391 #ifdef ASSERT
2392 {
2393 Label L;
2394 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2395 __ jcc(Assembler::equal, L);
2396 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2397 __ bind(L);
2398 }
2399 #endif /* ASSERT */
2400
2401 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2402
2403 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2404 restore_native_result(masm, ret_type, stack_slots);
2405 }
2406 __ jmp(unlock_done);
2407
2408 // END Slow path unlock
2409
2410 } // synchronized
2411
2412 // SLOW PATH Reguard the stack if needed
2413
2414 __ bind(reguard);
2415 __ vzeroupper();
2416 save_native_result(masm, ret_type, stack_slots);
2417 __ mov(r12, rsp); // remember sp
2418 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2419 __ andptr(rsp, -16); // align stack as required by ABI
2420 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2421 __ mov(rsp, r12); // restore sp
2422 __ reinit_heapbase();
2423 restore_native_result(masm, ret_type, stack_slots);
2424 // and continue
2425 __ jmp(reguard_done);
2426
2427
2428
2429 __ flush();
2430
2431 nmethod *nm = nmethod::new_native_nmethod(method,
2432 compile_id,
2433 masm->code(),
2434 vep_offset,
2435 frame_complete,
2436 stack_slots / VMRegImpl::slots_per_word,
2437 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2438 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2439 oop_maps);
2440
2441 return nm;
2442 }
2443
2444 // this function returns the adjustment size (in number of words) to a c2i adapter
2445 // activation for use during deoptimization
2446 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2447 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2448 }
2449
2450
2451 uint SharedRuntime::out_preserve_stack_slots() {
2452 return 0;
2453 }
2454
2455
2456 // Number of stack slots between incoming argument block and the start of
2457 // a new frame. The PROLOG must add this many slots to the stack. The
2458 // EPILOG must remove this many slots. amd64 needs two slots for
2459 // return address.
2460 uint SharedRuntime::in_preserve_stack_slots() {
2461 return 4 + 2 * VerifyStackAtCalls;
2462 }
2463
2464 //------------------------------generate_deopt_blob----------------------------
2465 void SharedRuntime::generate_deopt_blob() {
2466 // Allocate space for the code
2467 ResourceMark rm;
2468 // Setup code generation tools
2469 int pad = 0;
2470 if (UseAVX > 2) {
2471 pad += 1024;
2472 }
2473 #if INCLUDE_JVMCI
2474 if (EnableJVMCI) {
2475 pad += 512; // Increase the buffer size when compiling for JVMCI
2476 }
2477 #endif
2478 CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2479 MacroAssembler* masm = new MacroAssembler(&buffer);
2480 int frame_size_in_words;
2481 OopMap* map = NULL;
2482 OopMapSet *oop_maps = new OopMapSet();
2483
2484 // -------------
2485 // This code enters when returning to a de-optimized nmethod. A return
2486 // address has been pushed on the stack, and return values are in
2487 // registers.
2488 // If we are doing a normal deopt then we were called from the patched
2489 // nmethod from the point we returned to the nmethod. So the return
2490 // address on the stack is wrong by NativeCall::instruction_size
2491 // We will adjust the value so it looks like we have the original return
2492 // address on the stack (like when we eagerly deoptimized).
2493 // In the case of an exception pending when deoptimizing, we enter
2494 // with a return address on the stack that points after the call we patched
2495 // into the exception handler. We have the following register state from,
2496 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2497 // rax: exception oop
2498 // rbx: exception handler
2499 // rdx: throwing pc
2500 // So in this case we simply jam rdx into the useless return address and
2501 // the stack looks just like we want.
2502 //
2503 // At this point we need to de-opt. We save the argument return
2504 // registers. We call the first C routine, fetch_unroll_info(). This
2505 // routine captures the return values and returns a structure which
2506 // describes the current frame size and the sizes of all replacement frames.
2507 // The current frame is compiled code and may contain many inlined
2508 // functions, each with their own JVM state. We pop the current frame, then
2509 // push all the new frames. Then we call the C routine unpack_frames() to
2510 // populate these frames. Finally unpack_frames() returns us the new target
2511 // address. Notice that callee-save registers are BLOWN here; they have
2512 // already been captured in the vframeArray at the time the return PC was
2513 // patched.
2514 address start = __ pc();
2515 Label cont;
2516
2517 // Prolog for non exception case!
2518
2519 // Save everything in sight.
2520 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2521
2522 // Normal deoptimization. Save exec mode for unpack_frames.
2523 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2524 __ jmp(cont);
2525
2526 int reexecute_offset = __ pc() - start;
2527 #if INCLUDE_JVMCI && !defined(COMPILER1)
2528 if (EnableJVMCI && UseJVMCICompiler) {
2529 // JVMCI does not use this kind of deoptimization
2530 __ should_not_reach_here();
2531 }
2532 #endif
2533
2534 // Reexecute case
2535 // return address is the pc that describes what bci to re-execute at
2536
2537 // No need to update map as each call to save_live_registers will produce identical oopmap
2538 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2539
2540 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2541 __ jmp(cont);
2542
2543 #if INCLUDE_JVMCI
2544 Label after_fetch_unroll_info_call;
2545 int implicit_exception_uncommon_trap_offset = 0;
2546 int uncommon_trap_offset = 0;
2547
2548 if (EnableJVMCI) {
2549 implicit_exception_uncommon_trap_offset = __ pc() - start;
2550
2551 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2552 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2553
2554 uncommon_trap_offset = __ pc() - start;
2555
2556 // Save everything in sight.
2557 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2558 // fetch_unroll_info needs to call last_java_frame()
2559 __ set_last_Java_frame(noreg, noreg, NULL);
2560
2561 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2562 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2563
2564 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2565 __ mov(c_rarg0, r15_thread);
2566 __ movl(c_rarg2, r14); // exec mode
2567 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2568 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2569
2570 __ reset_last_Java_frame(false);
2571
2572 __ jmp(after_fetch_unroll_info_call);
2573 } // EnableJVMCI
2574 #endif // INCLUDE_JVMCI
2575
2576 int exception_offset = __ pc() - start;
2577
2578 // Prolog for exception case
2579
2580 // all registers are dead at this entry point, except for rax and
2581 // rdx, which contain the exception oop and exception pc
2582 // respectively. Set them in TLS and fall thru to the
2583 // unpack_with_exception_in_tls entry point.
2584
2585 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2586 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2587
2588 int exception_in_tls_offset = __ pc() - start;
2589
2590 // new implementation because exception oop is now passed in JavaThread
2591
2592 // Prolog for exception case
2593 // All registers must be preserved because they might be used by LinearScan
2594 // Exception oop and throwing PC are passed in JavaThread
2595 // tos: stack at point of call to method that threw the exception (i.e. only
2596 // args are on the stack, no return address)
2597
2598 // make room on stack for the return address
2599 // It will be patched later with the throwing pc. The correct value is not
2600 // available now because loading it from memory would destroy registers.
2601 __ push(0);
2602
2603 // Save everything in sight.
2604 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2605
2606 // Now it is safe to overwrite any register
2607
2608 // Deopt during an exception. Save exec mode for unpack_frames.
2609 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2610
2611 // load throwing pc from JavaThread and patch it as the return address
2612 // of the current frame. Then clear the field in JavaThread
2613
2614 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2615 __ movptr(Address(rbp, wordSize), rdx);
2616 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2617
2618 #ifdef ASSERT
2619 // verify that there is really an exception oop in JavaThread
2620 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2621 __ verify_oop(rax);
2622
2623 // verify that there is no pending exception
2624 Label no_pending_exception;
2625 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2626 __ testptr(rax, rax);
2627 __ jcc(Assembler::zero, no_pending_exception);
2628 __ stop("must not have pending exception here");
2629 __ bind(no_pending_exception);
2630 #endif
2631
2632 __ bind(cont);
2633
2634 // Call C code. Need thread and this frame, but NOT official VM entry
2635 // crud. We cannot block on this call, no GC can happen.
2636 //
2637 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2638
2639 // fetch_unroll_info needs to call last_java_frame().
2640
2641 __ set_last_Java_frame(noreg, noreg, NULL);
2642 #ifdef ASSERT
2643 { Label L;
2644 __ cmpptr(Address(r15_thread,
2645 JavaThread::last_Java_fp_offset()),
2646 (int32_t)0);
2647 __ jcc(Assembler::equal, L);
2648 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2649 __ bind(L);
2650 }
2651 #endif // ASSERT
2652 __ mov(c_rarg0, r15_thread);
2653 __ movl(c_rarg1, r14); // exec_mode
2654 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2655
2656 // Need to have an oopmap that tells fetch_unroll_info where to
2657 // find any register it might need.
2658 oop_maps->add_gc_map(__ pc() - start, map);
2659
2660 __ reset_last_Java_frame(false);
2661
2662 #if INCLUDE_JVMCI
2663 if (EnableJVMCI) {
2664 __ bind(after_fetch_unroll_info_call);
2665 }
2666 #endif
2667
2668 // Load UnrollBlock* into rdi
2669 __ mov(rdi, rax);
2670
2671 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2672 Label noException;
2673 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2674 __ jcc(Assembler::notEqual, noException);
2675 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2676 // QQQ this is useless it was NULL above
2677 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2678 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2679 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2680
2681 __ verify_oop(rax);
2682
2683 // Overwrite the result registers with the exception results.
2684 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2685 // I think this is useless
2686 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2687
2688 __ bind(noException);
2689
2690 // Only register save data is on the stack.
2691 // Now restore the result registers. Everything else is either dead
2692 // or captured in the vframeArray.
2693 RegisterSaver::restore_result_registers(masm);
2694
2695 // All of the register save area has been popped off the stack. Only the
2696 // return address remains.
2697
2698 // Pop all the frames we must move/replace.
2699 //
2700 // Frame picture (youngest to oldest)
2701 // 1: self-frame (no frame link)
2702 // 2: deopting frame (no frame link)
2703 // 3: caller of deopting frame (could be compiled/interpreted).
2704 //
2705 // Note: by leaving the return address of self-frame on the stack
2706 // and using the size of frame 2 to adjust the stack
2707 // when we are done, the return to frame 3 will still be on the stack.
2708
2709 // Pop deoptimized frame
2710 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2711 __ addptr(rsp, rcx);
2712
2713 // rsp should be pointing at the return address to the caller (3)
2714
2715 // Pick up the initial fp we should save
2716 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2717 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2718
2719 #ifdef ASSERT
2720 // Compilers generate code that bangs the stack by as much as the
2721 // interpreter would need. So this stack banging should never
2722 // trigger a fault. Verify that it does not on non product builds.
2723 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2724 __ bang_stack_size(rbx, rcx);
2725 #endif
2726
2727 // Load address of array of frame pcs into rcx
2728 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2729
2730 // Trash the old pc
2731 __ addptr(rsp, wordSize);
2732
2733 // Load address of array of frame sizes into rsi
2734 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2735
2736 // Load counter into rdx
2737 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2738
2739 // Now adjust the caller's stack to make up for the extra locals
2740 // but record the original sp so that we can save it in the skeletal interpreter
2741 // frame and the stack walking of interpreter_sender will get the unextended sp
2742 // value and not the "real" sp value.
2743
2744 const Register sender_sp = r8;
2745
2746 __ mov(sender_sp, rsp);
2747 __ movl(rbx, Address(rdi,
2748 Deoptimization::UnrollBlock::
2749 caller_adjustment_offset_in_bytes()));
2750 __ subptr(rsp, rbx);
2751
2752 // Push interpreter frames in a loop
2753 Label loop;
2754 __ bind(loop);
2755 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2756 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2757 __ pushptr(Address(rcx, 0)); // Save return address
2758 __ enter(); // Save old & set new ebp
2759 __ subptr(rsp, rbx); // Prolog
2760 // This value is corrected by layout_activation_impl
2761 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2762 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2763 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2764 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2765 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2766 __ decrementl(rdx); // Decrement counter
2767 __ jcc(Assembler::notZero, loop);
2768 __ pushptr(Address(rcx, 0)); // Save final return address
2769
2770 // Re-push self-frame
2771 __ enter(); // Save old & set new ebp
2772
2773 // Allocate a full sized register save area.
2774 // Return address and rbp are in place, so we allocate two less words.
2775 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2776
2777 // Restore frame locals after moving the frame
2778 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2779 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2780
2781 // Call C code. Need thread but NOT official VM entry
2782 // crud. We cannot block on this call, no GC can happen. Call should
2783 // restore return values to their stack-slots with the new SP.
2784 //
2785 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2786
2787 // Use rbp because the frames look interpreted now
2788 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2789 // Don't need the precise return PC here, just precise enough to point into this code blob.
2790 address the_pc = __ pc();
2791 __ set_last_Java_frame(noreg, rbp, the_pc);
2792
2793 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2794 __ mov(c_rarg0, r15_thread);
2795 __ movl(c_rarg1, r14); // second arg: exec_mode
2796 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2797 // Revert SP alignment after call since we're going to do some SP relative addressing below
2798 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2799
2800 // Set an oopmap for the call site
2801 // Use the same PC we used for the last java frame
2802 oop_maps->add_gc_map(the_pc - start,
2803 new OopMap( frame_size_in_words, 0 ));
2804
2805 // Clear fp AND pc
2806 __ reset_last_Java_frame(true);
2807
2808 // Collect return values
2809 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2810 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2811 // I think this is useless (throwing pc?)
2812 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2813
2814 // Pop self-frame.
2815 __ leave(); // Epilog
2816
2817 // Jump to interpreter
2818 __ ret(0);
2819
2820 // Make sure all code is generated
2821 masm->flush();
2822
2823 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2824 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2825 #if INCLUDE_JVMCI
2826 if (EnableJVMCI) {
2827 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2828 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2829 }
2830 #endif
2831 }
2832
2833 #ifdef COMPILER2
2834 //------------------------------generate_uncommon_trap_blob--------------------
2835 void SharedRuntime::generate_uncommon_trap_blob() {
2836 // Allocate space for the code
2837 ResourceMark rm;
2838 // Setup code generation tools
2839 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2840 MacroAssembler* masm = new MacroAssembler(&buffer);
2841
2842 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2843
2844 address start = __ pc();
2845
2846 if (UseRTMLocking) {
2847 // Abort RTM transaction before possible nmethod deoptimization.
2848 __ xabort(0);
2849 }
2850
2851 // Push self-frame. We get here with a return address on the
2852 // stack, so rsp is 8-byte aligned until we allocate our frame.
2853 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
2854
2855 // No callee saved registers. rbp is assumed implicitly saved
2856 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2857
2858 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2859 // runtime expects it.
2860 __ movl(c_rarg1, j_rarg0);
2861
2862 __ set_last_Java_frame(noreg, noreg, NULL);
2863
2864 // Call C code. Need thread but NOT official VM entry
2865 // crud. We cannot block on this call, no GC can happen. Call should
2866 // capture callee-saved registers as well as return values.
2867 // Thread is in rdi already.
2868 //
2869 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2870
2871 __ mov(c_rarg0, r15_thread);
2872 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2873 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2874
2875 // Set an oopmap for the call site
2876 OopMapSet* oop_maps = new OopMapSet();
2877 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2878
2879 // location of rbp is known implicitly by the frame sender code
2880
2881 oop_maps->add_gc_map(__ pc() - start, map);
2882
2883 __ reset_last_Java_frame(false);
2884
2885 // Load UnrollBlock* into rdi
2886 __ mov(rdi, rax);
2887
2888 #ifdef ASSERT
2889 { Label L;
2890 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2891 (int32_t)Deoptimization::Unpack_uncommon_trap);
2892 __ jcc(Assembler::equal, L);
2893 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2894 __ bind(L);
2895 }
2896 #endif
2897
2898 // Pop all the frames we must move/replace.
2899 //
2900 // Frame picture (youngest to oldest)
2901 // 1: self-frame (no frame link)
2902 // 2: deopting frame (no frame link)
2903 // 3: caller of deopting frame (could be compiled/interpreted).
2904
2905 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2906 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2907
2908 // Pop deoptimized frame (int)
2909 __ movl(rcx, Address(rdi,
2910 Deoptimization::UnrollBlock::
2911 size_of_deoptimized_frame_offset_in_bytes()));
2912 __ addptr(rsp, rcx);
2913
2914 // rsp should be pointing at the return address to the caller (3)
2915
2916 // Pick up the initial fp we should save
2917 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2918 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2919
2920 #ifdef ASSERT
2921 // Compilers generate code that bangs the stack by as much as the
2922 // interpreter would need. So this stack banging should never
2923 // trigger a fault. Verify that it does not on non product builds.
2924 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2925 __ bang_stack_size(rbx, rcx);
2926 #endif
2927
2928 // Load address of array of frame pcs into rcx (address*)
2929 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2930
2931 // Trash the return pc
2932 __ addptr(rsp, wordSize);
2933
2934 // Load address of array of frame sizes into rsi (intptr_t*)
2935 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes()));
2936
2937 // Counter
2938 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int)
2939
2940 // Now adjust the caller's stack to make up for the extra locals but
2941 // record the original sp so that we can save it in the skeletal
2942 // interpreter frame and the stack walking of interpreter_sender
2943 // will get the unextended sp value and not the "real" sp value.
2944
2945 const Register sender_sp = r8;
2946
2947 __ mov(sender_sp, rsp);
2948 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int)
2949 __ subptr(rsp, rbx);
2950
2951 // Push interpreter frames in a loop
2952 Label loop;
2953 __ bind(loop);
2954 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2955 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
2956 __ pushptr(Address(rcx, 0)); // Save return address
2957 __ enter(); // Save old & set new rbp
2958 __ subptr(rsp, rbx); // Prolog
2959 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2960 sender_sp); // Make it walkable
2961 // This value is corrected by layout_activation_impl
2962 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2963 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2964 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2965 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2966 __ decrementl(rdx); // Decrement counter
2967 __ jcc(Assembler::notZero, loop);
2968 __ pushptr(Address(rcx, 0)); // Save final return address
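
// Roughly, in pseudo-code (illustrative sketch only; array names follow the
// UnrollBlock fields loaded above, wordSize == 8 on x86_64):
//
//   for (int k = 0; k < number_of_frames; k++) {
//     push(frame_pcs[k]);                      // return address for frame k
//     push(rbp); rbp = rsp;                    // enter(): link the new frame
//     rsp -= frame_sizes[k] - 2 * wordSize;    // body (pc and rbp already pushed)
//     interpreter_frame_sender_sp = sender_sp; // make the frame walkable
//     interpreter_frame_last_sp   = NULL;      // fixed up by layout_activation_impl
//     sender_sp = rsp;
//   }
//   push(frame_pcs[number_of_frames]);         // final return address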
2969
2970 // Re-push self-frame
2971 __ enter(); // Save old & set new rbp
2972 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2973 // Prolog
2974
2975 // Use rbp because the frames look interpreted now
2976 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2977 // Don't need the precise return PC here, just precise enough to point into this code blob.
2978 address the_pc = __ pc();
2979 __ set_last_Java_frame(noreg, rbp, the_pc);
2980
2981 // Call C code. Need thread but NOT official VM entry
2982 // crud. We cannot block on this call, no GC can happen. Call should
2983 // restore return values to their stack-slots with the new SP.
2984 // Thread is in rdi already.
2985 //
2986 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2987
2988 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2989 __ mov(c_rarg0, r15_thread);
2990 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2991 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2992
2993 // Set an oopmap for the call site
2994 // Use the same PC we used for the last java frame
2995 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2996
2997 // Clear fp AND pc
2998 __ reset_last_Java_frame(true);
2999
3000 // Pop self-frame.
3001 __ leave(); // Epilog
3002
3003 // Jump to interpreter
3004 __ ret(0);
3005
3006 // Make sure all code is generated
3007 masm->flush();
3008
3009 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3010 SimpleRuntimeFrame::framesize >> 1);
3011 }
3012 #endif // COMPILER2
3013
3014 //------------------------------generate_handler_blob------
3015 //
3016 // Generate a special Compile2Runtime blob that saves all registers
3017 // and sets up an oopmap.
3018 //
3019 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3020 assert(StubRoutines::forward_exception_entry() != NULL,
3021 "must be generated before");
3022
3023 ResourceMark rm;
3024 OopMapSet *oop_maps = new OopMapSet();
3025 OopMap* map;
3026
3027 // Allocate space for the code. Setup code generation tools.
3028 CodeBuffer buffer("handler_blob", 2048, 1024);
3029 MacroAssembler* masm = new MacroAssembler(&buffer);
3030
3031 address start = __ pc();
3032 address call_pc = NULL;
3033 int frame_size_in_words;
3034 bool cause_return = (poll_type == POLL_AT_RETURN);
3035 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3036
3037 if (UseRTMLocking) {
3038 // Abort RTM transaction before calling runtime
3039 // because critical section will be large and will be
3040 // aborted anyway. Also nmethod could be deoptimized.
3041 __ xabort(0);
3042 }
3043
3044 // Make room for return address (or push it again)
3045 if (!cause_return) {
3046 __ push(rbx);
3047 }
3048
3049 // Save registers, fpu state, and flags
3050 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3051
3052 // The following is basically a call_VM. However, we need the precise
3053 // address of the call in order to generate an oopmap. Hence, we do all the
3054 // work ourselves.
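//
// In outline, the hand-rolled sequence below is (illustrative only):
//
//   set_last_Java_frame(noreg, noreg, NULL);
//   c_rarg0 = r15_thread;
//   call(call_ptr);
//   oop_maps->add_gc_map(__ pc() - start, map);   // keyed on the return pc
//
// i.e. the oop map is registered against the pc immediately after the call,
// which is why call_VM (which does not expose that pc) is not used here.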
3055
3056 __ set_last_Java_frame(noreg, noreg, NULL);
3057
3058 // The return address must always be correct so that the frame constructor never
3059 // sees an invalid pc.
3060
3061 if (!cause_return) {
3062 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3063 // Additionally, rbx is a callee saved register and we can look at it later to determine
3064 // if someone changed the return address for us!
3065 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3066 __ movptr(Address(rbp, wordSize), rbx);
3067 }
3068
3069 // Do the call
3070 __ mov(c_rarg0, r15_thread);
3071 __ call(RuntimeAddress(call_ptr));
3072
3073 // Set an oopmap for the call site. This oopmap will map all
3074 // oop-registers and debug-info registers as callee-saved. This
3075 // will allow deoptimization at this safepoint to find all possible
3076 // debug-info recordings, as well as let GC find all oops.
3077
3078 oop_maps->add_gc_map( __ pc() - start, map);
3079
3080 Label noException;
3081
3082 __ reset_last_Java_frame(false);
3083
3084 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3085 __ jcc(Assembler::equal, noException);
3086
3087 // Exception pending
3088
3089 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3090
3091 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3092
3093 // No exception case
3094 __ bind(noException);
3095
3096 Label no_adjust;
3097 #ifdef ASSERT
3098 Label bail;
3099 #endif
3100 if (!cause_return) {
3101 Label no_prefix, not_special;
3102
3103 // If our stashed return pc was modified by the runtime we avoid touching it
3104 __ cmpptr(rbx, Address(rbp, wordSize));
3105 __ jccb(Assembler::notEqual, no_adjust);
3106
3107 // Skip over the poll instruction.
3108 // See NativeInstruction::is_safepoint_poll()
3109 // Possible encodings:
3110 // 85 00 test %eax,(%rax)
3111 // 85 01 test %eax,(%rcx)
3112 // 85 02 test %eax,(%rdx)
3113 // 85 03 test %eax,(%rbx)
3114 // 85 06 test %eax,(%rsi)
3115 // 85 07 test %eax,(%rdi)
3116 //
3117 // 41 85 00 test %eax,(%r8)
3118 // 41 85 01 test %eax,(%r9)
3119 // 41 85 02 test %eax,(%r10)
3120 // 41 85 03 test %eax,(%r11)
3121 // 41 85 06 test %eax,(%r14)
3122 // 41 85 07 test %eax,(%r15)
3123 //
3124 // 85 04 24 test %eax,(%rsp)
3125 // 41 85 04 24 test %eax,(%r12)
3126 // 85 45 00 test %eax,0x0(%rbp)
3127 // 41 85 45 00 test %eax,0x0(%r13)
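//
// In effect, the code below computes the poll instruction length as
// (illustrative pseudo-code):
//
//   length  = 2;                                        // 0x85 + modrm
//   length += (byte[0] == REX.B prefix)       ? 1 : 0;  // r8..r15 base
//   length += (modrm base is rsp/rbp/r12/r13) ? 1 : 0;  // SIB byte or disp8
//   return_pc += length;                                // step over the poll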
3128
3129 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3130 __ jcc(Assembler::notEqual, no_prefix);
3131 __ addptr(rbx, 1);
3132 __ bind(no_prefix);
3133 #ifdef ASSERT
3134 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3135 #endif
3136 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3137 // r12/rsp 0x04
3138 // r13/rbp 0x05
3139 __ movzbq(rcx, Address(rbx, 1));
3140 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3141 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3142 __ cmpptr(rcx, 1);
3143 __ jcc(Assembler::above, not_special);
3144 __ addptr(rbx, 1);
3145 __ bind(not_special);
3146 #ifdef ASSERT
3147 // Verify the correct encoding of the poll we're about to skip.
3148 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3149 __ jcc(Assembler::notEqual, bail);
3150 // Mask out the modrm bits
3151 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3152 // rax encodes to 0, so if the bits are nonzero it's incorrect
3153 __ jcc(Assembler::notZero, bail);
3154 #endif
3155 // Adjust return pc forward to step over the safepoint poll instruction
3156 __ addptr(rbx, 2);
3157 __ movptr(Address(rbp, wordSize), rbx);
3158 }
3159
3160 __ bind(no_adjust);
3161 // Normal exit, restore registers and exit.
3162 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3163 __ ret(0);
3164
3165 #ifdef ASSERT
3166 __ bind(bail);
3167 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3168 #endif
3169
3170 // Make sure all code is generated
3171 masm->flush();
3172
3173 // Fill-out other meta info
3174 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3175 }
3176
3177 //
3178 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3179 //
3180 // Generate a stub that calls into the VM to find out the proper destination
3181 // of a Java call. All the argument registers are live at this point,
3182 // but since this is generic code we don't know what they are and the caller
3183 // must do any GC of the args.
3184 //
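// In outline, the generated stub does the following (illustrative sketch):
//
//   save all live registers (wide vectors are caller-saved, so skipped);
//   call 'destination'(thread) under a last_Java_frame and an oop map;
//   if a pending exception was installed: restore registers and forward it;
//   otherwise stash the returned Method* (vm_result_2) and the resolved
//   entry point (rax) into their register-save slots, restore registers,
//   and jmp to the entry point in rax.
//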
3185 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3186 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3187
3188 // allocate space for the code
3189 ResourceMark rm;
3190
3191 CodeBuffer buffer(name, 1000, 512);
3192 MacroAssembler* masm = new MacroAssembler(&buffer);
3193
3194 int frame_size_in_words;
3195
3196 OopMapSet *oop_maps = new OopMapSet();
3197 OopMap* map = NULL;
3198
3199 int start = __ offset();
3200
3201 // No need to save vector registers since they are caller-saved anyway.
3202 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3203
3204 int frame_complete = __ offset();
3205
3206 __ set_last_Java_frame(noreg, noreg, NULL);
3207
3208 __ mov(c_rarg0, r15_thread);
3209
3210 __ call(RuntimeAddress(destination));
3211
3212
3213 // Set an oopmap for the call site.
3214 // We need this not only for callee-saved registers, but also for volatile
3215 // registers that the compiler might be keeping live across a safepoint.
3216
3217 oop_maps->add_gc_map( __ offset() - start, map);
3218
3219 // rax contains the address we are going to jump to assuming no exception got installed
3220
3221 // clear last_Java_sp
3222 __ reset_last_Java_frame(false);
3223 // check for pending exceptions
3224 Label pending;
3225 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3226 __ jcc(Assembler::notEqual, pending);
3227
3228 // get the returned Method*
3229 __ get_vm_result_2(rbx, r15_thread);
3230 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3231
3232 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3233
3234 RegisterSaver::restore_live_registers(masm);
3235
3236 // We are back to the original state on entry and ready to go.
3237
3238 __ jmp(rax);
3239
3240 // Pending exception after the safepoint
3241
3242 __ bind(pending);
3243
3244 RegisterSaver::restore_live_registers(masm);
3245
3246 // exception pending => remove activation and forward to exception handler
3247
3248 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3249
3250 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3251 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3252
3253 // -------------
3254 // make sure all code is generated
3255 masm->flush();
3256
3257 // return the blob
3258 // Note: the frame size passed to new_runtime_stub is in words.
3259 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3260 }
3261
3262 #ifdef COMPILER2
3263 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3264
3265 class NativeInvokerGenerator : public StubCodeGenerator {
3266 address _call_target;
3267 int _shadow_space_bytes;
3268
3269 const GrowableArray<VMReg>& _input_registers;
3270 const GrowableArray<VMReg>& _output_registers;
3271
3272 int _frame_complete;
3273 int _framesize;
3274 OopMapSet* _oop_maps;
3275 public:
3276 NativeInvokerGenerator(CodeBuffer* buffer,
3277 address call_target,
3278 int shadow_space_bytes,
3279 const GrowableArray<VMReg>& input_registers,
3280 const GrowableArray<VMReg>& output_registers)
3281 : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3282 _call_target(call_target),
3283 _shadow_space_bytes(shadow_space_bytes),
3284 _input_registers(input_registers),
3285 _output_registers(output_registers),
3286 _frame_complete(0),
3287 _framesize(0),
3288 _oop_maps(NULL) {
3289 assert(_output_registers.length() <= 1
3290 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3291
3292 }
3293
3294 void generate();
3295
3296 int spill_size_in_bytes() const {
3297 if (_output_registers.length() == 0) {
3298 return 0;
3299 }
3300 VMReg reg = _output_registers.at(0);
3301 assert(reg->is_reg(), "must be a register");
3302 if (reg->is_Register()) {
3303 return 8;
3304 } else if (reg->is_XMMRegister()) {
3305 if (UseAVX >= 3) {
3306 return 64;
3307 } else if (UseAVX >= 1) {
3308 return 32;
3309 } else {
3310 return 16;
3311 }
3312 } else {
3313 ShouldNotReachHere();
3314 }
3315 return 0;
3316 }
3317
3318 void spill_out_registers() {
3319 if (_output_registers.length() == 0) {
3320 return;
3321 }
3322 VMReg reg = _output_registers.at(0);
3323 assert(reg->is_reg(), "must be a register");
3324 MacroAssembler* masm = _masm;
3325 if (reg->is_Register()) {
3326 __ movptr(Address(rsp, 0), reg->as_Register());
3327 } else if (reg->is_XMMRegister()) {
3328 if (UseAVX >= 3) {
3329 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3330 } else if (UseAVX >= 1) {
3331 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3332 } else {
3333 __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3334 }
3335 } else {
3336 ShouldNotReachHere();
3337 }
3338 }
3339
3340 void fill_out_registers() {
3341 if (_output_registers.length() == 0) {
3342 return;
3343 }
3344 VMReg reg = _output_registers.at(0);
3345 assert(reg->is_reg(), "must be a register");
3346 MacroAssembler* masm = _masm;
3347 if (reg->is_Register()) {
3348 __ movptr(reg->as_Register(), Address(rsp, 0));
3349 } else if (reg->is_XMMRegister()) {
3350 if (UseAVX >= 3) {
3351 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3352 } else if (UseAVX >= 1) {
3353 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3354 } else {
3355 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3356 }
3357 } else {
3358 ShouldNotReachHere();
3359 }
3360 }
3361
3362 int frame_complete() const {
3363 return _frame_complete;
3364 }
3365
3366 int framesize() const {
3367 return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3368 }
3369
3370 OopMapSet* oop_maps() const {
3371 return _oop_maps;
3372 }
3373
3374 private:
3375 #ifdef ASSERT
3376 bool target_uses_register(VMReg reg) {
3377 return _input_registers.contains(reg) || _output_registers.contains(reg);
3378 }
3379 #endif
3380 };
3381
3382 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3383 int shadow_space_bytes,
3384 const GrowableArray<VMReg>& input_registers,
3385 const GrowableArray<VMReg>& output_registers) {
3386 int locs_size = 64;
3387 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3388 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3389 g.generate();
3390 code.log_section_sizes("nep_invoker_blob");
3391
3392 RuntimeStub* stub =
3393 RuntimeStub::new_runtime_stub("nep_invoker_blob",
3394 &code,
3395 g.frame_complete(),
3396 g.framesize(),
3397 g.oop_maps(), false);
3398 return stub;
3399 }
3400
3401 void NativeInvokerGenerator::generate() {
3402 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3403
3404 enum layout {
3405 rbp_off,
3406 rbp_off2,
3407 return_off,
3408 return_off2,
3409 framesize // inclusive of return address
3410 };
3411
3412 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3413 assert(is_even(_framesize/2), "sp not 16-byte aligned");
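
// Worked example (illustrative, assuming no shadow space and a single
// integer return register): spill_size_in_bytes() == 8, so
// _framesize = align_up(4 + (0 + 8)/4, 4) = 8 slots = 32 bytes including
// the saved rbp and return address, i.e. a multiple of 16 bytes, which is
// what the alignment assert above checks.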
3414
3415 _oop_maps = new OopMapSet();
3416 MacroAssembler* masm = _masm;
3417
3418 address start = __ pc();
3419
3420 __ enter();
3421
3422 // return address and rbp are already in place
3423 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3424
3425 _frame_complete = __ pc() - start;
3426
3427 address the_pc = __ pc();
3428
3429 __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3430 OopMap* map = new OopMap(_framesize, 0);
3431 _oop_maps->add_gc_map(the_pc - start, map);
3432
3433 // State transition
3434 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3435
3436 __ call(RuntimeAddress(_call_target));
3437
3438 __ restore_cpu_control_state_after_jni();
3439
3440 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3441
3442 // Force this write out before the read below
3443 __ membar(Assembler::Membar_mask_bits(
3444 Assembler::LoadLoad | Assembler::LoadStore |
3445 Assembler::StoreLoad | Assembler::StoreStore));
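
// (The fence pairs the store of _thread_in_native_trans above with the
// safepoint-poll and suspend-flag loads below, so a VM thread arming a
// safepoint either observes this thread's state transition or this thread
// observes the armed poll.)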
3446
3447 Label L_after_safepoint_poll;
3448 Label L_safepoint_poll_slow_path;
3449
3450 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3451 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3452 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3453
3454 __ bind(L_after_safepoint_poll);
3455
3456 // change thread state
3457 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3458
3459 __ block_comment("reguard stack check");
3460 Label L_reguard;
3461 Label L_after_reguard;
3462 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3463 __ jcc(Assembler::equal, L_reguard);
3464 __ bind(L_after_reguard);
3465
3466 __ reset_last_Java_frame(r15_thread, true);
3467
3468 __ leave(); // required for proper stackwalking of RuntimeStub frame
3469 __ ret(0);
3470
3471 //////////////////////////////////////////////////////////////////////////////
3472
3473 __ block_comment("{ L_safepoint_poll_slow_path");
3474 __ bind(L_safepoint_poll_slow_path);
3475 __ vzeroupper();
3476
3477 spill_out_registers();
3478
3479 __ mov(c_rarg0, r15_thread);
3480 __ mov(r12, rsp); // remember sp
3481 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3482 __ andptr(rsp, -16); // align stack as required by ABI
3483 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3484 __ mov(rsp, r12); // restore sp
3485 __ reinit_heapbase();
3486
3487 fill_out_registers();
3488
3489 __ jmp(L_after_safepoint_poll);
3490 __ block_comment("} L_safepoint_poll_slow_path");
3491
3492 //////////////////////////////////////////////////////////////////////////////
3493
3494 __ block_comment("{ L_reguard");
3495 __ bind(L_reguard);
3496 __ vzeroupper();
3497
3498 spill_out_registers();
3499
3500 __ mov(r12, rsp); // remember sp
3501 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3502 __ andptr(rsp, -16); // align stack as required by ABI
3503 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3504 __ mov(rsp, r12); // restore sp
3505 __ reinit_heapbase();
3506
3507 fill_out_registers();
3508
3509 __ jmp(L_after_reguard);
3510
3511 __ block_comment("} L_reguard");
3512
3513 //////////////////////////////////////////////////////////////////////////////
3514
3515 __ flush();
3516 }
3517 #endif // COMPILER2
3518
3519 //------------------------------Montgomery multiplication------------------------
3520 //
3521
3522 #ifndef _WINDOWS
3523
3524 // Subtract 0:b from carry:a. Return carry.
3525 static julong
3526 sub(julong a[], julong b[], julong carry, long len) {
3527 long long i = 0, cnt = len;
3528 julong tmp;
3529 asm volatile("clc; "
3530 "0: ; "
3531 "mov (%[b], %[i], 8), %[tmp]; "
3532 "sbb %[tmp], (%[a], %[i], 8); "
3533 "inc %[i]; dec %[cnt]; "
3534 "jne 0b; "
3535 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3536 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3537 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3538 : "memory");
3539 return tmp;
3540 }
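
// Worked example (illustrative): with len == 1, a == { 5 }, b == { 7 } and
// carry == 1, the sbb loop leaves a[0] == 0xFFFFFFFFFFFFFFFE (i.e. 2^64 + 5 - 7)
// and the trailing "sbb $0" consumes the borrow, so the function returns 0.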
3541
3542 // Multiply (unsigned) Long A by Long B, accumulating the double-
3543 // length result into the accumulator formed of T0, T1, and T2.
3544 #define MACC(A, B, T0, T1, T2) \
3545 do { \
3546 unsigned long hi, lo; \
3547 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3548 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3549 : "r"(A), "a"(B) : "cc"); \
3550 } while(0)
3551
3552 // As above, but add twice the double-length result into the
3553 // accumulator.
3554 #define MACC2(A, B, T0, T1, T2) \
3555 do { \
3556 unsigned long hi, lo; \
3557 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3558 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3559 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3560 : "r"(A), "a"(B) : "cc"); \
3561 } while(0)
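
// In other words (illustrative note): MACC adds the full 128-bit product A*B
// into the three-word accumulator T2:T1:T0, and MACC2 adds it twice. For
// example, MACC(1ULL << 32, 1ULL << 32, t0, t1, t2) leaves t0 unchanged and
// adds 1 to t1 (plus any carry into t2), since the product is exactly 2^64.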
3562
3563 #else //_WINDOWS
3564
3565 static julong
3566 sub(julong a[], julong b[], julong carry, long len) {
3567 long i;
3568 julong tmp;
3569 unsigned char c = 1;
3570 for (i = 0; i < len; i++) {
3571 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3572 a[i] = tmp;
3573 }
3574 c = _addcarry_u64(c, carry, ~0, &tmp);
3575 return tmp;
3576 }
3577
3578 // Multiply (unsigned) Long A by Long B, accumulating the double-
3579 // length result into the accumulator formed of T0, T1, and T2.
3580 #define MACC(A, B, T0, T1, T2) \
3581 do { \
3582 julong hi, lo; \
3583 lo = _umul128(A, B, &hi); \
3584 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3585 c = _addcarry_u64(c, hi, T1, &T1); \
3586 _addcarry_u64(c, T2, 0, &T2); \
3587 } while(0)
3588
3589 // As above, but add twice the double-length result into the
3590 // accumulator.
3591 #define MACC2(A, B, T0, T1, T2) \
3592 do { \
3593 julong hi, lo; \
3594 lo = _umul128(A, B, &hi); \
3595 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3596 c = _addcarry_u64(c, hi, T1, &T1); \
3597 _addcarry_u64(c, T2, 0, &T2); \
3598 c = _addcarry_u64(0, lo, T0, &T0); \
3599 c = _addcarry_u64(c, hi, T1, &T1); \
3600 _addcarry_u64(c, T2, 0, &T2); \
3601 } while(0)
3602
3603 #endif //_WINDOWS
3604
3605 // Fast Montgomery multiplication. The derivation of the algorithm is
3606 // in A Cryptographic Library for the Motorola DSP56000,
3607 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
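//
// In Montgomery terms (illustrative summary): with R = 2^(64*len) (len 64-bit
// words) and inv == -n[0]^-1 mod 2^64 (hence the asserts that
// inv * n[0] == ULLONG_MAX), the routine below computes m such that
// m == a * b * R^-1 (mod n).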
3608
3609 static void NOINLINE
3610 montgomery_multiply(julong a[], julong b[], julong n[],
3611 julong m[], julong inv, int len) {
3612 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3613 int i;
3614
3615 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3616
3617 for (i = 0; i < len; i++) {
3618 int j;
3619 for (j = 0; j < i; j++) {
3620 MACC(a[j], b[i-j], t0, t1, t2);
3621 MACC(m[j], n[i-j], t0, t1, t2);
3622 }
3623 MACC(a[i], b[0], t0, t1, t2);
3624 m[i] = t0 * inv;
3625 MACC(m[i], n[0], t0, t1, t2);
3626
3627 assert(t0 == 0, "broken Montgomery multiply");
3628
3629 t0 = t1; t1 = t2; t2 = 0;
3630 }
3631
3632 for (i = len; i < 2*len; i++) {
3633 int j;
3634 for (j = i-len+1; j < len; j++) {
3635 MACC(a[j], b[i-j], t0, t1, t2);
3636 MACC(m[j], n[i-j], t0, t1, t2);
3637 }
3638 m[i-len] = t0;
3639 t0 = t1; t1 = t2; t2 = 0;
3640 }
3641
3642 while (t0)
3643 t0 = sub(m, n, t0, len);
3644 }
3645
3646 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3647 // multiplies so it should be up to 25% faster than Montgomery
3648 // multiplication. However, its loop control is more complex and it
3649 // may actually run slower on some machines.
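//
// Illustrative count: a full multiply issues about len^2 MACCs for the a*b
// terms plus len^2 for the m*n terms. Squaring folds a[j]*a[i-j] with
// a[i-j]*a[j] into a single MACC2, roughly halving the a-products to len^2/2
// while the m*n work stays the same, i.e. about 25% fewer multiplies overall.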
3650
3651 static void NOINLINE
3652 montgomery_square(julong a[], julong n[],
3653 julong m[], julong inv, int len) {
3654 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3655 int i;
3656
3657 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3658
3659 for (i = 0; i < len; i++) {
3660 int j;
3661 int end = (i+1)/2;
3662 for (j = 0; j < end; j++) {
3663 MACC2(a[j], a[i-j], t0, t1, t2);
3664 MACC(m[j], n[i-j], t0, t1, t2);
3665 }
3666 if ((i & 1) == 0) {
3667 MACC(a[j], a[j], t0, t1, t2);
3668 }
3669 for (; j < i; j++) {
3670 MACC(m[j], n[i-j], t0, t1, t2);
3671 }
3672 m[i] = t0 * inv;
3673 MACC(m[i], n[0], t0, t1, t2);
3674
3675 assert(t0 == 0, "broken Montgomery square");
3676
3677 t0 = t1; t1 = t2; t2 = 0;
3678 }
3679
3680 for (i = len; i < 2*len; i++) {
3681 int start = i-len+1;
3682 int end = start + (len - start)/2;
3683 int j;
3684 for (j = start; j < end; j++) {
3685 MACC2(a[j], a[i-j], t0, t1, t2);
3686 MACC(m[j], n[i-j], t0, t1, t2);
3687 }
3688 if ((i & 1) == 0) {
3689 MACC(a[j], a[j], t0, t1, t2);
3690 }
3691 for (; j < len; j++) {
3692 MACC(m[j], n[i-j], t0, t1, t2);
3693 }
3694 m[i-len] = t0;
3695 t0 = t1; t1 = t2; t2 = 0;
3696 }
3697
3698 while (t0)
3699 t0 = sub(m, n, t0, len);
3700 }
3701
3702 // Swap words in a longword.
3703 static julong swap(julong x) {
3704 return (x << 32) | (x >> 32);
3705 }
3706
3707 // Copy len longwords from s to d, word-swapping as we go. The
3708 // destination array is reversed.
3709 static void reverse_words(julong *s, julong *d, int len) {
3710 d += len;
3711 while(len-- > 0) {
3712 d--;
3713 *d = swap(*s);
3714 s++;
3715 }
3716 }
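
// Worked example (illustrative): with len == 2 and
// s == { 0x0000000100000002, 0x0000000300000004 }, the destination becomes
// d == { 0x0000000400000003, 0x0000000200000001 }, i.e. index 0 of d holds
// the word-swapped last element of s.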
3717
3718 // The threshold at which squaring is advantageous was determined
3719 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3720 #define MONTGOMERY_SQUARING_THRESHOLD 64
3721
3722 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3723 jint len, jlong inv,
3724 jint *m_ints) {
3725 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3726 int longwords = len/2;
3727
3728 // Make very sure we don't use so much space that the stack might
3729 // overflow. 512 jints correspond to a 16384-bit integer and will use
3730 // a total of 8K bytes of stack space here (four arrays of 256 julongs).
3731 int divisor = sizeof(julong) * 4;
3732 guarantee(longwords <= 8192 / divisor, "must be");
3733 int total_allocation = longwords * sizeof (julong) * 4;
3734 julong *scratch = (julong *)alloca(total_allocation);
3735
3736 // Local scratch arrays
3737 julong
3738 *a = scratch + 0 * longwords,
3739 *b = scratch + 1 * longwords,
3740 *n = scratch + 2 * longwords,
3741 *m = scratch + 3 * longwords;
3742
3743 reverse_words((julong *)a_ints, a, longwords);
3744 reverse_words((julong *)b_ints, b, longwords);
3745 reverse_words((julong *)n_ints, n, longwords);
3746
3747 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3748
3749 reverse_words(m, (julong *)m_ints, longwords);
3750 }
3751
3752 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3753 jint len, jlong inv,
3754 jint *m_ints) {
3755 assert(len % 2 == 0, "array length in montgomery_square must be even");
3756 int longwords = len/2;
3757
3758 // Make very sure we don't use so much space that the stack might
3759 // overflow. 512 jints correspond to a 16384-bit integer and will use
3760 // a total of 6K bytes of stack space here (three arrays of 256 julongs).
3761 int divisor = sizeof(julong) * 3;
3762 guarantee(longwords <= (8192 / divisor), "must be");
3763 int total_allocation = longwords * sizeof (julong) * 3;
3764 julong *scratch = (julong *)alloca(total_allocation);
3765
3766 // Local scratch arrays
3767 julong
3768 *a = scratch + 0 * longwords,
3769 *n = scratch + 1 * longwords,
3770 *m = scratch + 2 * longwords;
3771
3772 reverse_words((julong *)a_ints, a, longwords);
3773 reverse_words((julong *)n_ints, n, longwords);
3774
3775 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3776 ::montgomery_square(a, n, m, (julong)inv, longwords);
3777 } else {
3778 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3779 }
3780
3781 reverse_words(m, (julong *)m_ints, longwords);
3782 }
3783
3784 #ifdef COMPILER2
3785 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3786 //
3787 //------------------------------generate_exception_blob---------------------------
3788 // Creates the exception blob at the end.
3789 // This code (the exception blob) is jumped to from a compiled method
3790 // (see emit_exception_handler in the x86_64.ad file).
3791 //
3792 // Given an exception pc at a call, we call into the runtime for the
3793 // handler in this method. This handler might merely restore state
3794 // (i.e. callee-saved registers), unwind the frame, and jump to the
3795 // exception handler for the nmethod if there is no Java-level handler
3796 // for the nmethod.
3797 //
3798 // This code is entered with a jmp.
3799 //
3800 // Arguments:
3801 // rax: exception oop
3802 // rdx: exception pc
3803 //
3804 // Results:
3805 // rax: exception oop
3806 // rdx: exception pc in caller or ???
3807 // destination: exception handler of caller
3808 //
3809 // Note: the exception pc MUST be at a call (precise debug information)
3810 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3811 //
3812
3813 void OptoRuntime::generate_exception_blob() {
3814 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3815 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3816 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3817
3818 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3819
3820 // Allocate space for the code
3821 ResourceMark rm;
3822 // Setup code generation tools
3823 CodeBuffer buffer("exception_blob", 2048, 1024);
3824 MacroAssembler* masm = new MacroAssembler(&buffer);
3825
3826
3827 address start = __ pc();
3828
3829 // Exception pc is 'return address' for stack walker
3830 __ push(rdx);
3831 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3832
3833 // Save callee-saved registers. See x86_64.ad.
3834
3835 // rbp is an implicitly saved callee-saved register (i.e., the calling
3836 // convention will save/restore it in the prolog/epilog). Other than that
3837 // there are no callee-saved registers now that adapter frames are gone.
3838
3839 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3840
3841 // Store exception in Thread object. We cannot pass any arguments to the
3842 // handle_exception call, since we do not want to make any assumption
3843 // about the size of the frame where the exception happened in.
3844 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3845 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3846 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3847
3848 // This call does all the hard work. It checks if an exception handler
3849 // exists in the method.
3850 // If so, it returns the handler address.
3851 // If not, it prepares for stack-unwinding, restoring the callee-save
3852 // registers of the frame being removed.
3853 //
3854 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3855
3856 // At a method handle call, the stack may not be properly aligned
3857 // when returning with an exception.
3858 address the_pc = __ pc();
3859 __ set_last_Java_frame(noreg, noreg, the_pc);
3860 __ mov(c_rarg0, r15_thread);
3861 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3862 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3863
3864 // Set an oopmap for the call site. This oopmap will only be used if we
3865 // are unwinding the stack. Hence, all locations will be dead.
3866 // Callee-saved registers will be the same as the frame above (i.e.,
3867 // handle_exception_stub), since they were restored when we got the
3868 // exception.
3869
3870 OopMapSet* oop_maps = new OopMapSet();
3871
3872 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3873
3874 __ reset_last_Java_frame(false);
3875
3876 // Restore callee-saved registers
3877
3878 // rbp is an implicitly saved callee-saved register (i.e., the calling
3879 // convention will save/restore it in the prolog/epilog). Other than that
3880 // there are no callee-saved registers now that adapter frames are gone.
3881
3882 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3883
3884 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3885 __ pop(rdx); // No need for exception pc anymore
3886
3887 // rax: exception handler
3888
3889 // We have a handler in rax (could be deopt blob).
3890 __ mov(r8, rax);
3891
3892 // Get the exception oop
3893 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3894 // Get the exception pc in case we are deoptimized
3895 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3896 #ifdef ASSERT
3897 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3898 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3899 #endif
3900 // Clear the exception oop so GC no longer processes it as a root.
3901 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3902
3903 // rax: exception oop
3904 // r8: exception handler
3905 // rdx: exception pc
3906 // Jump to handler
3907
3908 __ jmp(r8);
3909
3910 // Make sure all code is generated
3911 masm->flush();
3912
3913 // Set exception blob
3914 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3915 }
3916 #endif // COMPILER2
3917
3918 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3919 int total_in_args, const VMRegPair* in_regs,
3920 int total_out_args, VMRegPair* out_regs,
3921 GrowableArray<int>& arg_order,
3922 VMRegPair tmp_vmreg) {
3923 ComputeMoveOrder order(total_in_args, in_regs,
3924 total_out_args, out_regs,
3925 in_sig_bt, arg_order, tmp_vmreg);
3926 }