1 /*
2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #ifndef _WINDOWS
27 #include "alloca.h"
28 #endif
29 #include "asm/macroAssembler.hpp"
30 #include "asm/macroAssembler.inline.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/icBuffer.hpp"
34 #include "code/nativeInst.hpp"
35 #include "code/vtableStubs.hpp"
36 #include "compiler/oopMap.hpp"
37 #include "gc/shared/collectedHeap.hpp"
38 #include "gc/shared/gcLocker.hpp"
39 #include "gc/shared/barrierSet.hpp"
40 #include "gc/shared/barrierSetAssembler.hpp"
41 #include "interpreter/interpreter.hpp"
42 #include "logging/log.hpp"
43 #include "memory/resourceArea.hpp"
44 #include "memory/universe.hpp"
45 #include "oops/compiledICHolder.hpp"
46 #include "oops/klass.inline.hpp"
47 #include "oops/method.inline.hpp"
48 #include "prims/methodHandles.hpp"
49 #include "runtime/continuation.hpp"
50 #include "runtime/continuationEntry.inline.hpp"
51 #include "runtime/globals.hpp"
52 #include "runtime/jniHandles.hpp"
53 #include "runtime/safepointMechanism.hpp"
54 #include "runtime/sharedRuntime.hpp"
55 #include "runtime/signature.hpp"
56 #include "runtime/stubRoutines.hpp"
57 #include "runtime/vframeArray.hpp"
58 #include "runtime/vm_version.hpp"
59 #include "utilities/align.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_JVMCI
69 #include "jvmci/jvmciJavaClasses.hpp"
70 #endif
71
72 #define __ masm->
73
74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
75
76 class SimpleRuntimeFrame {
77
78 public:
79
80 // Most of the runtime stubs have this simple frame layout.
81 // This class exists to make the layout shared in one place.
82 // Offsets are for compiler stack slots, which are jints.
83 enum layout {
84 // The frame sender code expects that rbp will be in the "natural" place and
85 // will override any oopMap setting for it. We must therefore force the layout
86 // so that it agrees with the frame sender code.
87 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
88 rbp_off2,
89 return_off, return_off2,
90 framesize
91 };
92 };
93
94 class RegisterSaver {
95 // Capture info about frame layout. Layout offsets are in jint
96 // units because compiler frame slots are jints.
97 #define XSAVE_AREA_BEGIN 160
98 #define XSAVE_AREA_YMM_BEGIN 576
99 #define XSAVE_AREA_OPMASK_BEGIN 1088
100 #define XSAVE_AREA_ZMM_BEGIN 1152
101 #define XSAVE_AREA_UPPERBANK 1664
102 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
103 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
104 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
105 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
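  // For example, DEF_XMM_OFFS(1) effectively yields "xmm1_off = xmm_off + 4, xmm1H_off":
  // each 16-byte XMM register spans four jint slots, so only registers 0 and 1 are
  // spelled out here and the remaining ones are implied by range usage below.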
107 enum layout {
108 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
109 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
110 DEF_XMM_OFFS(0),
111 DEF_XMM_OFFS(1),
112 // 2..15 are implied in range usage
113 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
114 DEF_YMM_OFFS(0),
115 DEF_YMM_OFFS(1),
116 // 2..15 are implied in range usage
117 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
118 DEF_OPMASK_OFFS(0),
119 DEF_OPMASK_OFFS(1),
120 // 2..7 are implied in range usage
121 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
122 DEF_ZMM_OFFS(0),
123 DEF_ZMM_OFFS(1),
124 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
125 DEF_ZMM_UPPER_OFFS(16),
126 DEF_ZMM_UPPER_OFFS(17),
127 // 18..31 are implied in range usage
128 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
129 fpu_stateH_end,
130 r15_off, r15H_off,
131 r14_off, r14H_off,
132 r13_off, r13H_off,
133 r12_off, r12H_off,
134 r11_off, r11H_off,
135 r10_off, r10H_off,
136 r9_off, r9H_off,
137 r8_off, r8H_off,
138 rdi_off, rdiH_off,
139 rsi_off, rsiH_off,
140 ignore_off, ignoreH_off, // extra copy of rbp
141 rsp_off, rspH_off,
142 rbx_off, rbxH_off,
143 rdx_off, rdxH_off,
144 rcx_off, rcxH_off,
145 rax_off, raxH_off,
146 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
147 align_off, alignH_off,
148 flags_off, flagsH_off,
149 // The frame sender code expects that rbp will be in the "natural" place and
150 // will override any oopMap setting for it. We must therefore force the layout
151 // so that it agrees with the frame sender code.
152 rbp_off, rbpH_off, // copy of rbp we will restore
153 return_off, returnH_off, // slot for return address
154 reg_save_size // size in compiler stack slots
155 };
156
157 public:
158 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
159 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
160
161 // Offsets into the register save area
162 // Used by deoptimization when it is managing result register
163 // values on its own
164
165 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
166 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
167 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
168 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
169 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
170
171 // During deoptimization only the result registers need to be restored,
172 // all the other values have already been extracted.
173 static void restore_result_registers(MacroAssembler* masm);
174 };
175
176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
177 int off = 0;
178 int num_xmm_regs = XMMRegister::available_xmm_registers();
179 #if COMPILER2_OR_JVMCI
180 if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
182 }
183 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
184 #else
185 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
186 #endif
187
  // Always make the frame size 16-byte aligned; both vector and non-vector save areas are always allocated
189 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
190 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
191 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
192 // CodeBlob frame size is in words.
193 int frame_size_in_words = frame_size_in_bytes / wordSize;
194 *total_frame_words = frame_size_in_words;
195
196 // Save registers, fpu state, and flags.
197 // We assume caller has already pushed the return address onto the
198 // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just as a normal enter would leave it.
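  //
  // Roughly, after enter() and push_CPU_state() the frame looks like this, from high
  // to low addresses: return address, saved rbp, flags and the alignment word, the
  // integer register area (rax .. r15), and finally the FPU/XSAVE save area --
  // matching the RegisterSaver layout enum above.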
201
202 __ enter(); // rsp becomes 16-byte aligned here
203 __ push_CPU_state(); // Push a multiple of 16 bytes
204
  // push_CPU_state handles this on EVEX-enabled targets
206 if (save_wide_vectors) {
207 // Save upper half of YMM registers(0..15)
208 int base_addr = XSAVE_AREA_YMM_BEGIN;
209 for (int n = 0; n < 16; n++) {
210 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
211 }
212 if (VM_Version::supports_evex()) {
213 // Save upper half of ZMM registers(0..15)
214 base_addr = XSAVE_AREA_ZMM_BEGIN;
215 for (int n = 0; n < 16; n++) {
216 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
217 }
218 // Save full ZMM registers(16..num_xmm_regs)
219 base_addr = XSAVE_AREA_UPPERBANK;
220 off = 0;
221 int vector_len = Assembler::AVX_512bit;
222 for (int n = 16; n < num_xmm_regs; n++) {
223 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
224 }
225 #if COMPILER2_OR_JVMCI
226 base_addr = XSAVE_AREA_OPMASK_BEGIN;
227 off = 0;
228 for(int n = 0; n < KRegister::number_of_registers; n++) {
229 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
230 }
231 #endif
232 }
233 } else {
234 if (VM_Version::supports_evex()) {
235 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
236 int base_addr = XSAVE_AREA_UPPERBANK;
237 off = 0;
238 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
239 for (int n = 16; n < num_xmm_regs; n++) {
240 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
241 }
242 #if COMPILER2_OR_JVMCI
243 base_addr = XSAVE_AREA_OPMASK_BEGIN;
244 off = 0;
245 for(int n = 0; n < KRegister::number_of_registers; n++) {
246 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
247 }
248 #endif
249 }
250 }
251 __ vzeroupper();
252 if (frame::arg_reg_save_area_bytes != 0) {
253 // Allocate argument register save area
254 __ subptr(rsp, frame::arg_reg_save_area_bytes);
255 }
256
257 // Set an oopmap for the call site. This oopmap will map all
258 // oop-registers and debug-info registers as callee-saved. This
259 // will allow deoptimization at this safepoint to find all possible
260 // debug-info recordings, as well as let GC find all oops.
261
262 OopMapSet *oop_maps = new OopMapSet();
263 OopMap* map = new OopMap(frame_size_in_slots, 0);
264
265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
266
267 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
268 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
269 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
270 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
271 // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
273 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
274 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
275 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
276 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
277 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
278 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
279 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
280 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
281 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
282 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
283 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
284 // on EVEX enabled targets, we get it included in the xsave area
285 off = xmm0_off;
286 int delta = xmm1_off - off;
287 for (int n = 0; n < 16; n++) {
288 XMMRegister xmm_name = as_XMMRegister(n);
289 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
290 off += delta;
291 }
292 if (UseAVX > 2) {
293 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
294 off = zmm16_off;
295 delta = zmm17_off - off;
296 for (int n = 16; n < num_xmm_regs; n++) {
297 XMMRegister zmm_name = as_XMMRegister(n);
298 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
299 off += delta;
300 }
301 }
302
303 #if COMPILER2_OR_JVMCI
304 if (save_wide_vectors) {
305 // Save upper half of YMM registers(0..15)
306 off = ymm0_off;
307 delta = ymm1_off - ymm0_off;
308 for (int n = 0; n < 16; n++) {
309 XMMRegister ymm_name = as_XMMRegister(n);
310 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
311 off += delta;
312 }
313 if (VM_Version::supports_evex()) {
314 // Save upper half of ZMM registers(0..15)
315 off = zmm0_off;
316 delta = zmm1_off - zmm0_off;
317 for (int n = 0; n < 16; n++) {
318 XMMRegister zmm_name = as_XMMRegister(n);
319 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
320 off += delta;
321 }
322 }
323 }
324 #endif // COMPILER2_OR_JVMCI
325
326 // %%% These should all be a waste but we'll keep things as they were for now
327 if (true) {
328 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
329 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
330 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
331 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
332 // rbp location is known implicitly by the frame sender code, needs no oopmap
333 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
334 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
335 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
336 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
337 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
338 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
339 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
340 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
341 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
342 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
343 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
344 // on EVEX enabled targets, we get it included in the xsave area
345 off = xmm0H_off;
346 delta = xmm1H_off - off;
347 for (int n = 0; n < 16; n++) {
348 XMMRegister xmm_name = as_XMMRegister(n);
349 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
350 off += delta;
351 }
352 if (UseAVX > 2) {
353 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
354 off = zmm16H_off;
355 delta = zmm17H_off - off;
356 for (int n = 16; n < num_xmm_regs; n++) {
357 XMMRegister zmm_name = as_XMMRegister(n);
358 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
359 off += delta;
360 }
361 }
362 }
363
364 return map;
365 }
366
367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
368 int num_xmm_regs = XMMRegister::available_xmm_registers();
369 if (frame::arg_reg_save_area_bytes != 0) {
370 // Pop arg register save area
371 __ addptr(rsp, frame::arg_reg_save_area_bytes);
372 }
373
374 #if COMPILER2_OR_JVMCI
375 if (restore_wide_vectors) {
376 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
377 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
378 }
379 #else
380 assert(!restore_wide_vectors, "vectors are generated only by C2");
381 #endif
382
383 __ vzeroupper();
384
385 // On EVEX enabled targets everything is handled in pop fpu state
386 if (restore_wide_vectors) {
387 // Restore upper half of YMM registers (0..15)
388 int base_addr = XSAVE_AREA_YMM_BEGIN;
389 for (int n = 0; n < 16; n++) {
390 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
391 }
392 if (VM_Version::supports_evex()) {
393 // Restore upper half of ZMM registers (0..15)
394 base_addr = XSAVE_AREA_ZMM_BEGIN;
395 for (int n = 0; n < 16; n++) {
396 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
397 }
398 // Restore full ZMM registers(16..num_xmm_regs)
399 base_addr = XSAVE_AREA_UPPERBANK;
400 int vector_len = Assembler::AVX_512bit;
401 int off = 0;
402 for (int n = 16; n < num_xmm_regs; n++) {
403 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
404 }
405 #if COMPILER2_OR_JVMCI
406 base_addr = XSAVE_AREA_OPMASK_BEGIN;
407 off = 0;
408 for (int n = 0; n < KRegister::number_of_registers; n++) {
409 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
410 }
411 #endif
412 }
413 } else {
414 if (VM_Version::supports_evex()) {
415 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
416 int base_addr = XSAVE_AREA_UPPERBANK;
417 int off = 0;
418 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
419 for (int n = 16; n < num_xmm_regs; n++) {
420 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
421 }
422 #if COMPILER2_OR_JVMCI
423 base_addr = XSAVE_AREA_OPMASK_BEGIN;
424 off = 0;
425 for (int n = 0; n < KRegister::number_of_registers; n++) {
426 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
427 }
428 #endif
429 }
430 }
431
432 // Recover CPU state
433 __ pop_CPU_state();
434 // Get the rbp described implicitly by the calling convention (no oopMap)
435 __ pop(rbp);
436 }
437
438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
439
  // Just restore result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
445
446 // Restore fp result register
447 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
448 // Restore integer result register
449 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
450 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
451
  // Pop all of the register save area off the stack except the return address
453 __ addptr(rsp, return_offset_in_bytes());
454 }
455
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using the fxsave/fxrstor instructions.
458 bool SharedRuntime::is_wide_vector(int size) {
459 return size > 16;
460 }
461
462 // ---------------------------------------------------------------------------
463 // Read the array of BasicTypes from a signature, and compute where the
464 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
465 // quantities. Values less than VMRegImpl::stack0 are registers, those above
466 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
467 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first stack slot, 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
472
473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
474 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
476
477 // The Java calling convention is a "shifted" version of the C ABI.
478 // By skipping the first C ABI register we can call non-static jni methods
479 // with small numbers of arguments without having to shuffle the arguments
480 // at all. Since we control the java ABI we ought to at least get some
481 // advantage out of it.
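// As an illustration only: for a signature (int, long, Object, float) the code below
// assigns the int to j_rarg0, the long to j_rarg1 (its T_VOID half gets no register),
// the Object to j_rarg2 and the float to j_farg0. Arguments spill to 4-byte stack
// slots only once the 6 integer or 8 float argument registers are exhausted, and the
// return value is the number of stack slots used.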
482
483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
484 VMRegPair *regs,
485 int total_args_passed) {
486
487 // Create the mapping between argument positions and
488 // registers.
489 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
490 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
491 };
492 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
493 j_farg0, j_farg1, j_farg2, j_farg3,
494 j_farg4, j_farg5, j_farg6, j_farg7
495 };
496
497
498 uint int_args = 0;
499 uint fp_args = 0;
500 uint stk_args = 0;
501
502 for (int i = 0; i < total_args_passed; i++) {
503 switch (sig_bt[i]) {
504 case T_BOOLEAN:
505 case T_CHAR:
506 case T_BYTE:
507 case T_SHORT:
508 case T_INT:
509 if (int_args < Argument::n_int_register_parameters_j) {
510 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
511 } else {
512 stk_args = align_up(stk_args, 2);
513 regs[i].set1(VMRegImpl::stack2reg(stk_args));
514 stk_args += 1;
515 }
516 break;
517 case T_VOID:
518 // halves of T_LONG or T_DOUBLE
519 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
520 regs[i].set_bad();
521 break;
522 case T_LONG:
523 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
524 // fall through
525 case T_OBJECT:
526 case T_ARRAY:
527 case T_ADDRESS:
528 if (int_args < Argument::n_int_register_parameters_j) {
529 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
530 } else {
531 stk_args = align_up(stk_args, 2);
532 regs[i].set2(VMRegImpl::stack2reg(stk_args));
533 stk_args += 2;
534 }
535 break;
536 case T_FLOAT:
537 if (fp_args < Argument::n_float_register_parameters_j) {
538 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
539 } else {
540 stk_args = align_up(stk_args, 2);
541 regs[i].set1(VMRegImpl::stack2reg(stk_args));
542 stk_args += 1;
543 }
544 break;
545 case T_DOUBLE:
546 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
547 if (fp_args < Argument::n_float_register_parameters_j) {
548 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
549 } else {
550 stk_args = align_up(stk_args, 2);
551 regs[i].set2(VMRegImpl::stack2reg(stk_args));
552 stk_args += 2;
553 }
554 break;
555 default:
556 ShouldNotReachHere();
557 break;
558 }
559 }
560
561 return stk_args;
562 }
563
// Patch the caller's callsite with the entry to compiled code, if it exists.
565 static void patch_callers_callsite(MacroAssembler *masm) {
566 Label L;
567 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
568 __ jcc(Assembler::equal, L);
569
570 // Save the current stack pointer
571 __ mov(r13, rsp);
572 // Schedule the branch target address early.
573 // Call into the VM to patch the caller, then jump to compiled callee
574 // rax isn't live so capture return address while we easily can
575 __ movptr(rax, Address(rsp, 0));
576
577 // align stack so push_CPU_state doesn't fault
578 __ andptr(rsp, -(StackAlignmentInBytes));
579 __ push_CPU_state();
580 __ vzeroupper();
581 // VM needs caller's callsite
582 // VM needs target method
583 // This needs to be a long call since we will relocate this adapter to
584 // the codeBuffer and it may not reach
585
586 // Allocate argument register save area
587 if (frame::arg_reg_save_area_bytes != 0) {
588 __ subptr(rsp, frame::arg_reg_save_area_bytes);
589 }
590 __ mov(c_rarg0, rbx);
591 __ mov(c_rarg1, rax);
592 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
593
594 // De-allocate argument register save area
595 if (frame::arg_reg_save_area_bytes != 0) {
596 __ addptr(rsp, frame::arg_reg_save_area_bytes);
597 }
598
599 __ vzeroupper();
600 __ pop_CPU_state();
601 // restore sp
602 __ mov(rsp, r13);
603 __ bind(L);
604 }
605
606
607 static void gen_c2i_adapter(MacroAssembler *masm,
608 int total_args_passed,
609 int comp_args_on_stack,
610 const BasicType *sig_bt,
611 const VMRegPair *regs,
612 Label& skip_fixup) {
613 // Before we get into the guts of the C2I adapter, see if we should be here
614 // at all. We've come from compiled code and are attempting to jump to the
615 // interpreter, which means the caller made a static call to get here
616 // (vcalls always get a compiled target if there is one). Check for a
617 // compiled target. If there is one, we need to patch the caller's call.
618 patch_callers_callsite(masm);
619
620 __ bind(skip_fixup);
621
622 // Since all args are passed on the stack, total_args_passed *
623 // Interpreter::stackElementSize is the space we need.
624
625 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
626
627 int extraspace = (total_args_passed * Interpreter::stackElementSize);
628
629 // stack is aligned, keep it that way
630 // This is not currently needed or enforced by the interpreter, but
631 // we might as well conform to the ABI.
632 extraspace = align_up(extraspace, 2*wordSize);
633
634 // set senderSP value
635 __ lea(r13, Address(rsp, wordSize));
636
637 #ifdef ASSERT
638 __ check_stack_alignment(r13, "sender stack not aligned");
639 #endif
640 if (extraspace > 0) {
641 // Pop the return address
642 __ pop(rax);
643
644 __ subptr(rsp, extraspace);
645
646 // Push the return address
647 __ push(rax);
648
649 // Account for the return address location since we store it first rather
650 // than hold it in a register across all the shuffling
651 extraspace += wordSize;
652 }
653
654 #ifdef ASSERT
655 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
656 #endif
657
658 // Now write the args into the outgoing interpreter space
659 for (int i = 0; i < total_args_passed; i++) {
660 if (sig_bt[i] == T_VOID) {
661 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
662 continue;
663 }
664
665 // offset to start parameters
666 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
667 int next_off = st_off - Interpreter::stackElementSize;
668
669 // Say 4 args:
670 // i st_off
671 // 0 32 T_LONG
672 // 1 24 T_VOID
673 // 2 16 T_OBJECT
674 // 3 8 T_BOOL
675 // - 0 return address
676 //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
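    //
    // Concretely (illustration): for the T_LONG at i == 0 above, the 8-byte value is
    // written at next_off (24) while the higher st_off slot (32) is left unused
    // (debug builds fill it with known junk).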
681
682 VMReg r_1 = regs[i].first();
683 VMReg r_2 = regs[i].second();
684 if (!r_1->is_valid()) {
685 assert(!r_2->is_valid(), "");
686 continue;
687 }
688 if (r_1->is_stack()) {
689 // memory to memory use rax
690 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
691 if (!r_2->is_valid()) {
692 // sign extend??
693 __ movl(rax, Address(rsp, ld_off));
694 __ movptr(Address(rsp, st_off), rax);
695
696 } else {
697
698 __ movq(rax, Address(rsp, ld_off));
699
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
701 // T_DOUBLE and T_LONG use two slots in the interpreter
702 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
703 // ld_off == LSW, ld_off+wordSize == MSW
704 // st_off == MSW, next_off == LSW
705 __ movq(Address(rsp, next_off), rax);
706 #ifdef ASSERT
707 // Overwrite the unused slot with known junk
708 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
709 __ movptr(Address(rsp, st_off), rax);
710 #endif /* ASSERT */
711 } else {
712 __ movq(Address(rsp, st_off), rax);
713 }
714 }
715 } else if (r_1->is_Register()) {
716 Register r = r_1->as_Register();
717 if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
719 // why not sign extend??
720 __ movl(Address(rsp, st_off), r);
721 } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
723 // T_DOUBLE and T_LONG use two slots in the interpreter
724 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
725 // long/double in gpr
726 #ifdef ASSERT
727 // Overwrite the unused slot with known junk
728 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
729 __ movptr(Address(rsp, st_off), rax);
730 #endif /* ASSERT */
731 __ movq(Address(rsp, next_off), r);
732 } else {
733 __ movptr(Address(rsp, st_off), r);
734 }
735 }
736 } else {
737 assert(r_1->is_XMMRegister(), "");
738 if (!r_2->is_valid()) {
        // only a float; use just part of the slot
740 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
741 } else {
742 #ifdef ASSERT
743 // Overwrite the unused slot with known junk
744 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
745 __ movptr(Address(rsp, st_off), rax);
746 #endif /* ASSERT */
747 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
748 }
749 }
750 }
751
752 // Schedule the branch target address early.
753 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
754 __ jmp(rcx);
755 }
756
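// Emits a range check: if pc_reg lies strictly within (code_start, code_end) control
// jumps to L_ok, otherwise it falls through past L_fail so the caller's subsequent
// code can handle the failure.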
757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
758 address code_start, address code_end,
759 Label& L_ok) {
760 Label L_fail;
761 __ lea(temp_reg, ExternalAddress(code_start));
762 __ cmpptr(pc_reg, temp_reg);
763 __ jcc(Assembler::belowEqual, L_fail);
764 __ lea(temp_reg, ExternalAddress(code_end));
765 __ cmpptr(pc_reg, temp_reg);
766 __ jcc(Assembler::below, L_ok);
767 __ bind(L_fail);
768 }
769
770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
771 int total_args_passed,
772 int comp_args_on_stack,
773 const BasicType *sig_bt,
774 const VMRegPair *regs) {
775
776 // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
778 // code goes non-entrant while we get args ready.
779 // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry, or else we
  // lose the alignment expected by all compiled code, and the register
  // save code can segv when the fxsave instruction finds an improperly
  // aligned stack pointer.
784
785 // Adapters can be frameless because they do not require the caller
786 // to perform additional cleanup work, such as correcting the stack pointer.
787 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
788 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
789 // even if a callee has modified the stack pointer.
790 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
791 // routinely repairs its caller's stack pointer (from sender_sp, which is set
792 // up via the senderSP register).
793 // In other words, if *either* the caller or callee is interpreted, we can
794 // get the stack pointer repaired after a call.
795 // This is why c2i and i2c adapters cannot be indefinitely composed.
796 // In particular, if a c2i adapter were to somehow call an i2c adapter,
797 // both caller and callee would be compiled methods, and neither would
798 // clean up the stack pointer changes performed by the two adapters.
799 // If this happens, control eventually transfers back to the compiled
800 // caller, but with an uncorrected stack, causing delayed havoc.
801
802 if (VerifyAdapterCalls &&
803 (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
804 // So, let's test for cascading c2i/i2c adapters right now.
805 // assert(Interpreter::contains($return_addr) ||
806 // StubRoutines::contains($return_addr),
807 // "i2c adapter must return to an interpreter frame");
808 __ block_comment("verify_i2c { ");
809 // Pick up the return address
810 __ movptr(rax, Address(rsp, 0));
811 Label L_ok;
812 if (Interpreter::code() != nullptr) {
813 range_check(masm, rax, r11,
814 Interpreter::code()->code_start(),
815 Interpreter::code()->code_end(),
816 L_ok);
817 }
818 if (StubRoutines::initial_stubs_code() != nullptr) {
819 range_check(masm, rax, r11,
820 StubRoutines::initial_stubs_code()->code_begin(),
821 StubRoutines::initial_stubs_code()->code_end(),
822 L_ok);
823 }
824 if (StubRoutines::final_stubs_code() != nullptr) {
825 range_check(masm, rax, r11,
826 StubRoutines::final_stubs_code()->code_begin(),
827 StubRoutines::final_stubs_code()->code_end(),
828 L_ok);
829 }
830 const char* msg = "i2c adapter must return to an interpreter frame";
831 __ block_comment(msg);
832 __ stop(msg);
833 __ bind(L_ok);
834 __ block_comment("} verify_i2ce ");
835 }
836
837 // Must preserve original SP for loading incoming arguments because
838 // we need to align the outgoing SP for compiled code.
839 __ movptr(r11, rsp);
840
841 // Pick up the return address
842 __ pop(rax);
843
844 // Convert 4-byte c2 stack slots to words.
845 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
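  // (e.g. 5 compiled 4-byte slots -> 20 bytes -> aligned up to 24 bytes -> 3 words)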
846
847 if (comp_args_on_stack) {
848 __ subptr(rsp, comp_words_on_stack * wordSize);
849 }
850
851 // Ensure compiled code always sees stack at proper alignment
852 __ andptr(rsp, -16);
853
  // push the return address, leaving the stack misaligned exactly as the youngest
  // frame always sees it with respect to the placement of the call instruction
856 __ push(rax);
857
858 // Put saved SP in another register
859 const Register saved_sp = rax;
860 __ movptr(saved_sp, r11);
861
862 // Will jump to the compiled code just as if compiled code was doing it.
863 // Pre-load the register-jump target early, to schedule it better.
864 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
865
866 #if INCLUDE_JVMCI
867 if (EnableJVMCI) {
868 // check if this call should be routed towards a specific entry point
869 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
870 Label no_alternative_target;
871 __ jcc(Assembler::equal, no_alternative_target);
872 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
873 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
874 __ bind(no_alternative_target);
875 }
876 #endif // INCLUDE_JVMCI
877
878 // Now generate the shuffle code. Pick up all register args and move the
879 // rest through the floating point stack top.
880 for (int i = 0; i < total_args_passed; i++) {
881 if (sig_bt[i] == T_VOID) {
882 // Longs and doubles are passed in native word order, but misaligned
883 // in the 32-bit build.
884 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
885 continue;
886 }
887
888 // Pick up 0, 1 or 2 words from SP+offset.
889
890 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
891 "scrambled load targets?");
892 // Load in argument order going down.
893 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
894 // Point to interpreter value (vs. tag)
895 int next_off = ld_off - Interpreter::stackElementSize;
896 //
897 //
898 //
899 VMReg r_1 = regs[i].first();
900 VMReg r_2 = regs[i].second();
901 if (!r_1->is_valid()) {
902 assert(!r_2->is_valid(), "");
903 continue;
904 }
905 if (r_1->is_stack()) {
906 // Convert stack slot to an SP offset (+ wordSize to account for return address )
907 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
908
909 // We can use r13 as a temp here because compiled code doesn't need r13 as an input
910 // and if we end up going thru a c2i because of a miss a reasonable value of r13
911 // will be generated.
912 if (!r_2->is_valid()) {
913 // sign extend???
914 __ movl(r13, Address(saved_sp, ld_off));
915 __ movptr(Address(rsp, st_off), r13);
916 } else {
917 //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address
924
925 // ld_off is MSW so get LSW
926 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
927 next_off : ld_off;
928 __ movq(r13, Address(saved_sp, offset));
929 // st_off is LSW (i.e. reg.first())
930 __ movq(Address(rsp, st_off), r13);
931 }
932 } else if (r_1->is_Register()) { // Register argument
933 Register r = r_1->as_Register();
934 assert(r != rax, "must be different");
935 if (r_2->is_valid()) {
936 //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
940
941 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
942 next_off : ld_off;
943
944 // this can be a misaligned move
945 __ movq(r, Address(saved_sp, offset));
946 } else {
947 // sign extend and use a full word?
948 __ movl(r, Address(saved_sp, ld_off));
949 }
950 } else {
951 if (!r_2->is_valid()) {
952 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
953 } else {
954 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
955 }
956 }
957 }
958
959 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
960
961 // 6243940 We might end up in handle_wrong_method if
962 // the callee is deoptimized as we race thru here. If that
963 // happens we don't want to take a safepoint because the
964 // caller frame will look interpreted and arguments are now
965 // "compiled" so it is much better to make this transition
966 // invisible to the stack walking code. Unfortunately if
967 // we try and find the callee by normal means a safepoint
968 // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
970
971 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
972
  // Put Method* where a c2i would expect it, should we end up there.
  // Only needed because the c2 resolve stubs return Method* as a result in
  // rax.
976 __ mov(rax, rbx);
977 __ jmp(r11);
978 }
979
980 // ---------------------------------------------------------------
981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
982 int total_args_passed,
983 int comp_args_on_stack,
984 const BasicType *sig_bt,
985 const VMRegPair *regs,
986 AdapterFingerPrint* fingerprint) {
987 address i2c_entry = __ pc();
988
989 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
990
991 // -------------------------------------------------------------------------
992 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
993 // to the interpreter. The args start out packed in the compiled layout. They
994 // need to be unpacked into the interpreter layout. This will almost always
995 // require some stack space. We grow the current (compiled) stack, then repack
996 // the args. We finally end in a jump to the generic interpreter entry point.
997 // On exit from the interpreter, the interpreter will restore our SP (lest the
998 // compiled code, which relies solely on SP and not RBP, get sick).
999
1000 address c2i_unverified_entry = __ pc();
1001 Label skip_fixup;
1002 Label ok;
1003
1004 Register holder = rax;
1005 Register receiver = j_rarg0;
1006 Register temp = rbx;
1007
1008 {
1009 __ load_klass(temp, receiver, rscratch1);
1010 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1011 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1012 __ jcc(Assembler::equal, ok);
1013 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1014
1015 __ bind(ok);
1016 // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
1018 // the call site corrected.
1019 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1020 __ jcc(Assembler::equal, skip_fixup);
1021 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1022 }
1023
1024 address c2i_entry = __ pc();
1025
1026 // Class initialization barrier for static methods
1027 address c2i_no_clinit_check_entry = nullptr;
1028 if (VM_Version::supports_fast_class_init_checks()) {
1029 Label L_skip_barrier;
1030 Register method = rbx;
1031
1032 { // Bypass the barrier for non-static methods
1033 Register flags = rscratch1;
1034 __ movl(flags, Address(method, Method::access_flags_offset()));
1035 __ testl(flags, JVM_ACC_STATIC);
1036 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1037 }
1038
1039 Register klass = rscratch1;
1040 __ load_method_holder(klass, method);
1041 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1042
1043 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1044
1045 __ bind(L_skip_barrier);
1046 c2i_no_clinit_check_entry = __ pc();
1047 }
1048
1049 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1050 bs->c2i_entry_barrier(masm);
1051
1052 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1053
1054 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1055 }
1056
1057 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1058 VMRegPair *regs,
1059 VMRegPair *regs2,
1060 int total_args_passed) {
1061 assert(regs2 == nullptr, "not needed on x86");
1062 // We return the amount of VMRegImpl stack slots we need to reserve for all
1063 // the arguments NOT counting out_preserve_stack_slots.
1064
1065 // NOTE: These arrays will have to change when c1 is ported
1066 #ifdef _WIN64
1067 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1068 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1069 };
1070 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1071 c_farg0, c_farg1, c_farg2, c_farg3
1072 };
1073 #else
1074 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1075 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1076 };
1077 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1078 c_farg0, c_farg1, c_farg2, c_farg3,
1079 c_farg4, c_farg5, c_farg6, c_farg7
1080 };
1081 #endif // _WIN64
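  // Note the asymmetry handled below: on Windows the four argument registers are
  // positional and shared between integer and FP arguments (hence int_args and
  // fp_args bump each other), while the SysV ABI hands out its 6 integer and 8 FP
  // registers independently.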
1082
1083
1084 uint int_args = 0;
1085 uint fp_args = 0;
1086 uint stk_args = 0; // inc by 2 each time
1087
1088 for (int i = 0; i < total_args_passed; i++) {
1089 switch (sig_bt[i]) {
1090 case T_BOOLEAN:
1091 case T_CHAR:
1092 case T_BYTE:
1093 case T_SHORT:
1094 case T_INT:
1095 if (int_args < Argument::n_int_register_parameters_c) {
1096 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1097 #ifdef _WIN64
1098 fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1100 stk_args += 2;
1101 #endif
1102 } else {
1103 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1104 stk_args += 2;
1105 }
1106 break;
1107 case T_LONG:
1108 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1109 // fall through
1110 case T_OBJECT:
1111 case T_ARRAY:
1112 case T_ADDRESS:
1113 case T_METADATA:
1114 if (int_args < Argument::n_int_register_parameters_c) {
1115 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1116 #ifdef _WIN64
1117 fp_args++;
1118 stk_args += 2;
1119 #endif
1120 } else {
1121 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1122 stk_args += 2;
1123 }
1124 break;
1125 case T_FLOAT:
1126 if (fp_args < Argument::n_float_register_parameters_c) {
1127 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1128 #ifdef _WIN64
1129 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1131 stk_args += 2;
1132 #endif
1133 } else {
1134 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1135 stk_args += 2;
1136 }
1137 break;
1138 case T_DOUBLE:
1139 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1140 if (fp_args < Argument::n_float_register_parameters_c) {
1141 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1142 #ifdef _WIN64
1143 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1145 stk_args += 2;
1146 #endif
1147 } else {
1148 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1149 stk_args += 2;
1150 }
1151 break;
1152 case T_VOID: // Halves of longs and doubles
1153 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1154 regs[i].set_bad();
1155 break;
1156 default:
1157 ShouldNotReachHere();
1158 break;
1159 }
1160 }
1161 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1164 if (stk_args < 8) {
1165 stk_args = 8;
1166 }
1167 #endif // _WIN64
1168
1169 return stk_args;
1170 }
1171
1172 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1173 uint num_bits,
1174 uint total_args_passed) {
1175 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1176 "only certain vector sizes are supported for now");
1177
1178 static const XMMRegister VEC_ArgReg[32] = {
1179 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1180 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1181 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1182 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1183 };
1184
1185 uint stk_args = 0;
1186 uint fp_args = 0;
1187
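  // Each vector argument is passed in the next XMM/YMM/ZMM register; the VMRegPair
  // spans all of its 4-byte VMReg slots, so next_val is num_bits/32 - 1 (e.g. 7 for
  // a 256-bit vector). Nothing is passed on the stack, so stk_args stays 0.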
1188 for (uint i = 0; i < total_args_passed; i++) {
1189 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1190 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1191 regs[i].set_pair(vmreg->next(next_val), vmreg);
1192 }
1193
1194 return stk_args;
1195 }
1196
1197 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1200 switch (ret_type) {
1201 case T_FLOAT:
1202 __ movflt(Address(rbp, -wordSize), xmm0);
1203 break;
1204 case T_DOUBLE:
1205 __ movdbl(Address(rbp, -wordSize), xmm0);
1206 break;
1207 case T_VOID: break;
1208 default: {
1209 __ movptr(Address(rbp, -wordSize), rax);
1210 }
1211 }
1212 }
1213
1214 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1217 switch (ret_type) {
1218 case T_FLOAT:
1219 __ movflt(xmm0, Address(rbp, -wordSize));
1220 break;
1221 case T_DOUBLE:
1222 __ movdbl(xmm0, Address(rbp, -wordSize));
1223 break;
1224 case T_VOID: break;
1225 default: {
1226 __ movptr(rax, Address(rbp, -wordSize));
1227 }
1228 }
1229 }
1230
1231 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1232 for ( int i = first_arg ; i < arg_count ; i++ ) {
1233 if (args[i].first()->is_Register()) {
1234 __ push(args[i].first()->as_Register());
1235 } else if (args[i].first()->is_XMMRegister()) {
1236 __ subptr(rsp, 2*wordSize);
1237 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1238 }
1239 }
1240 }
1241
1242 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1243 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1244 if (args[i].first()->is_Register()) {
1245 __ pop(args[i].first()->as_Register());
1246 } else if (args[i].first()->is_XMMRegister()) {
1247 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1248 __ addptr(rsp, 2*wordSize);
1249 }
1250 }
1251 }
1252
1253 static void verify_oop_args(MacroAssembler* masm,
1254 const methodHandle& method,
1255 const BasicType* sig_bt,
1256 const VMRegPair* regs) {
1257 Register temp_reg = rbx; // not part of any compiled calling seq
1258 if (VerifyOops) {
1259 for (int i = 0; i < method->size_of_parameters(); i++) {
1260 if (is_reference_type(sig_bt[i])) {
1261 VMReg r = regs[i].first();
1262 assert(r->is_valid(), "bad oop arg");
1263 if (r->is_stack()) {
1264 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1265 __ verify_oop(temp_reg);
1266 } else {
1267 __ verify_oop(r->as_Register());
1268 }
1269 }
1270 }
1271 }
1272 }
1273
1274 static void check_continuation_enter_argument(VMReg actual_vmreg,
1275 Register expected_reg,
1276 const char* name) {
1277 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1278 assert(actual_vmreg->as_Register() == expected_reg,
1279 "%s is in unexpected register: %s instead of %s",
1280 name, actual_vmreg->as_Register()->name(), expected_reg->name());
1281 }
1282
1283
1284 //---------------------------- continuation_enter_setup ---------------------------
1285 //
1286 // Arguments:
1287 // None.
1288 //
1289 // Results:
1290 // rsp: pointer to blank ContinuationEntry
1291 //
1292 // Kills:
1293 // rax
1294 //
1295 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1296 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1297 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
1298 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1299
1300 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1301 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1302
1303 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1304 OopMap* map = new OopMap(frame_size, 0);
1305
1306 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1307 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1308 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1309
1310 return map;
1311 }
1312
1313 //---------------------------- fill_continuation_entry ---------------------------
1314 //
1315 // Arguments:
1316 // rsp: pointer to blank Continuation entry
1317 // reg_cont_obj: pointer to the continuation
1318 // reg_flags: flags
1319 //
1320 // Results:
1321 // rsp: pointer to filled out ContinuationEntry
1322 //
1323 // Kills:
1324 // rax
1325 //
1326 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1327 assert_different_registers(rax, reg_cont_obj, reg_flags);
1328 #ifdef ASSERT
1329 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1330 #endif
1331 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1332 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1333 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1334 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1335 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1336
1337 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1338 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1339 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1340 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1341
1342 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1343 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1344 }
1345
1346 //---------------------------- continuation_enter_cleanup ---------------------------
1347 //
1348 // Arguments:
1349 // rsp: pointer to the ContinuationEntry
1350 //
1351 // Results:
1352 // rsp: pointer to the spilled rbp in the entry frame
1353 //
1354 // Kills:
1355 // rbx
1356 //
1357 void static continuation_enter_cleanup(MacroAssembler* masm) {
1358 #ifdef ASSERT
1359 Label L_good_sp;
1360 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1361 __ jcc(Assembler::equal, L_good_sp);
1362 __ stop("Incorrect rsp at continuation_enter_cleanup");
1363 __ bind(L_good_sp);
1364 #endif
1365
1366 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1367 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1368 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1369 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1370
1371 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1372 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1373 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1374 }
1375
1376 static void gen_continuation_enter(MacroAssembler* masm,
1377 const VMRegPair* regs,
1378 int& exception_offset,
1379 OopMapSet* oop_maps,
1380 int& frame_complete,
1381 int& stack_slots,
1382 int& interpreted_entry_offset,
1383 int& compiled_entry_offset) {
1384
1385 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1386 int pos_cont_obj = 0;
1387 int pos_is_cont = 1;
1388 int pos_is_virtual = 2;
1389
1390 // The platform-specific calling convention may present the arguments in various registers.
1391 // To simplify the rest of the code, we expect the arguments to reside at these known
1392 // registers, and we additionally check the placement here in case calling convention ever
1393 // changes.
1394 Register reg_cont_obj = c_rarg1;
1395 Register reg_is_cont = c_rarg2;
1396 Register reg_is_virtual = c_rarg3;
1397
1398 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1399 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1400 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1401
1402 // Utility methods kill rax, make sure there are no collisions
1403 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1404
1405 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1406 relocInfo::static_call_type);
1407
1408 address start = __ pc();
1409
1410 Label L_thaw, L_exit;
1411
1412 // i2i entry used at interp_only_mode only
1413 interpreted_entry_offset = __ pc() - start;
1414 {
1415 #ifdef ASSERT
1416 Label is_interp_only;
1417 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1418 __ jcc(Assembler::notEqual, is_interp_only);
1419 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1420 __ bind(is_interp_only);
1421 #endif
1422
1423 __ pop(rax); // return address
1424 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1425 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1426 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1427 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1428 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1429 __ push(rax); // return address
1430 __ push_cont_fastpath();
1431
1432 __ enter();
1433
1434 stack_slots = 2; // will be adjusted in setup
1435 OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1438
1439 __ verify_oop(reg_cont_obj);
1440
1441 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1442
1443 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1444 __ testptr(reg_is_cont, reg_is_cont);
1445 __ jcc(Assembler::notZero, L_thaw);
1446
1447 // --- Resolve path
1448
1449 // Make sure the call is patchable
1450 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1451 // Emit stub for static call
1452 CodeBuffer* cbuf = masm->code_section()->outer();
1453 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1454 if (stub == nullptr) {
1455 fatal("CodeCache is full at gen_continuation_enter");
1456 }
1457 __ call(resolve);
1458 oop_maps->add_gc_map(__ pc() - start, map);
1459 __ post_call_nop();
1460
1461 __ jmp(L_exit);
1462 }
1463
1464 // compiled entry
1465 __ align(CodeEntryAlignment);
1466 compiled_entry_offset = __ pc() - start;
1467 __ enter();
1468
1469 stack_slots = 2; // will be adjusted in setup
1470 OopMap* map = continuation_enter_setup(masm, stack_slots);
1471
1472 // Frame is now completed as far as size and linkage.
1473 frame_complete = __ pc() - start;
1474
1475 __ verify_oop(reg_cont_obj);
1476
1477 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1478
1479 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1480 __ testptr(reg_is_cont, reg_is_cont);
1481 __ jccb(Assembler::notZero, L_thaw);
1482
1483 // --- call Continuation.enter(Continuation c, boolean isContinue)
1484
1485 // Make sure the call is patchable
1486 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1487
1488 // Emit stub for static call
1489 CodeBuffer* cbuf = masm->code_section()->outer();
1490 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1491 if (stub == nullptr) {
1492 fatal("CodeCache is full at gen_continuation_enter");
1493 }
1494
1495 // The call needs to be resolved. There's a special case for this in
1496 // SharedRuntime::find_callee_info_helper() which calls
1497 // LinkResolver::resolve_continuation_enter() which resolves the call to
1498 // Continuation.enter(Continuation c, boolean isContinue).
1499 __ call(resolve);
1500
1501 oop_maps->add_gc_map(__ pc() - start, map);
1502 __ post_call_nop();
1503
1504 __ jmpb(L_exit);
1505
1506 // --- Thawing path
1507
1508 __ bind(L_thaw);
1509
1510 __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1511
1512 ContinuationEntry::_return_pc_offset = __ pc() - start;
1513 oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1514 __ post_call_nop();
1515
1516 // --- Normal exit (resolve/thawing)
1517
1518 __ bind(L_exit);
1519
1520 continuation_enter_cleanup(masm);
1521 __ pop(rbp);
1522 __ ret(0);
1523
1524 // --- Exception handling path
1525
1526 exception_offset = __ pc() - start;
1527
1528 continuation_enter_cleanup(masm);
1529 __ pop(rbp);
1530
1531 __ movptr(c_rarg0, r15_thread);
1532 __ movptr(c_rarg1, Address(rsp, 0)); // return address
1533
1534 // rax still holds the original exception oop, save it before the call
1535 __ push(rax);
1536
1537 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1538 __ movptr(rbx, rax);
1539
1540 // Continue at exception handler:
1541 // rax: exception oop
1542 // rbx: exception handler
1543 // rdx: exception pc
1544 __ pop(rax);
1545 __ verify_oop(rax);
1546 __ pop(rdx);
1547 __ jmp(rbx);
1548 }
1549
1550 static void gen_continuation_yield(MacroAssembler* masm,
1551 const VMRegPair* regs,
1552 OopMapSet* oop_maps,
1553 int& frame_complete,
1554 int& stack_slots,
1555 int& compiled_entry_offset) {
1556 enum layout {
1557 rbp_off,
1558 rbpH_off,
1559 return_off,
1560 return_off2,
1561 framesize // inclusive of return address
1562 };
1563 stack_slots = framesize / VMRegImpl::slots_per_word;
1564 assert(stack_slots == 2, "recheck layout");
1565
1566 address start = __ pc();
1567 compiled_entry_offset = __ pc() - start;
1568 __ enter();
1569 address the_pc = __ pc();
1570
1571 frame_complete = the_pc - start;
1572
1573 // This nop must be exactly at the PC we push into the frame info.
1574 // We use this nop for fast CodeBlob lookup, so associate the OopMap
1575 // with it right away.
1576 __ post_call_nop();
1577 OopMap* map = new OopMap(framesize, 1);
1578 oop_maps->add_gc_map(frame_complete, map);
1579
1580 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1581 __ movptr(c_rarg0, r15_thread);
1582 __ movptr(c_rarg1, rsp);
1583 __ call_VM_leaf(Continuation::freeze_entry(), 2);
1584 __ reset_last_Java_frame(true);
1585
1586 Label L_pinned;
1587
1588 __ testptr(rax, rax);
1589 __ jcc(Assembler::notZero, L_pinned);
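// rax == 0 means the freeze succeeded: unwind the stack to the continuation
// entry and return.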
1590
1591 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1592 continuation_enter_cleanup(masm);
1593 __ pop(rbp);
1594 __ ret(0);
1595
1596 __ bind(L_pinned);
1597
1598 // Pinned, return to caller
1599
1600 // handle pending exception thrown by freeze
1601 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1602 Label ok;
1603 __ jcc(Assembler::equal, ok);
1604 __ leave();
1605 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1606 __ bind(ok);
1607
1608 __ leave();
1609 __ ret(0);
1610 }
1611
1612 static void gen_special_dispatch(MacroAssembler* masm,
1613 const methodHandle& method,
1614 const BasicType* sig_bt,
1615 const VMRegPair* regs) {
1616 verify_oop_args(masm, method, sig_bt, regs);
1617 vmIntrinsics::ID iid = method->intrinsic_id();
1618
1619 // Now write the args into the outgoing interpreter space
1620 bool has_receiver = false;
1621 Register receiver_reg = noreg;
1622 int member_arg_pos = -1;
1623 Register member_reg = noreg;
1624 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1625 if (ref_kind != 0) {
1626 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1627 member_reg = rbx; // known to be free at this point
1628 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1629 } else if (iid == vmIntrinsics::_invokeBasic) {
1630 has_receiver = true;
1631 } else if (iid == vmIntrinsics::_linkToNative) {
1632 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
1633 member_reg = rbx; // known to be free at this point
1634 } else {
1635 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1636 }
1637
1638 if (member_reg != noreg) {
1639 // Load the member_arg into register, if necessary.
1640 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1641 VMReg r = regs[member_arg_pos].first();
1642 if (r->is_stack()) {
1643 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1644 } else {
1645 // no data motion is needed
1646 member_reg = r->as_Register();
1647 }
1648 }
1649
1650 if (has_receiver) {
1651 // Make sure the receiver is loaded into a register.
1652 assert(method->size_of_parameters() > 0, "oob");
1653 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1654 VMReg r = regs[0].first();
1655 assert(r->is_valid(), "bad receiver arg");
1656 if (r->is_stack()) {
1657 // Porting note: This assumes that compiled calling conventions always
1658 // pass the receiver oop in a register. If this is not true on some
1659 // platform, pick a temp and load the receiver from stack.
1660 fatal("receiver always in a register");
1661 receiver_reg = j_rarg0; // known to be free at this point
1662 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1663 } else {
1664 // no data motion is needed
1665 receiver_reg = r->as_Register();
1666 }
1667 }
1668
1669 // Figure out which address we are really jumping to:
1670 MethodHandles::generate_method_handle_dispatch(masm, iid,
1671 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1672 }
1673
1674 // ---------------------------------------------------------------------------
1675 // Generate a native wrapper for a given method. The method takes arguments
1676 // in the Java compiled code convention, marshals them to the native
1677 // convention (handlizes oops, etc), transitions to native, makes the call,
1678 // returns to java state (possibly blocking), unhandlizes any result and
1679 // returns.
1680 //
1681 // Critical native functions are a shorthand for the use of
1682 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1683 // functions. The wrapper is expected to unpack the arguments before
1684 // passing them to the callee. Critical native functions leave the state _in_Java,
1685 // since they cannot stop for GC.
1686 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1687 // block and the check for pending exceptions, since it's impossible for them
1688 // to be thrown.
1689 //
1690 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1691 const methodHandle& method,
1692 int compile_id,
1693 BasicType* in_sig_bt,
1694 VMRegPair* in_regs,
1695 BasicType ret_type) {
1696 if (method->is_continuation_native_intrinsic()) {
1697 int exception_offset = -1;
1698 OopMapSet* oop_maps = new OopMapSet();
1699 int frame_complete = -1;
1700 int stack_slots = -1;
1701 int interpreted_entry_offset = -1;
1702 int vep_offset = -1;
1703 if (method->is_continuation_enter_intrinsic()) {
1704 gen_continuation_enter(masm,
1705 in_regs,
1706 exception_offset,
1707 oop_maps,
1708 frame_complete,
1709 stack_slots,
1710 interpreted_entry_offset,
1711 vep_offset);
1712 } else if (method->is_continuation_yield_intrinsic()) {
1713 gen_continuation_yield(masm,
1714 in_regs,
1715 oop_maps,
1716 frame_complete,
1717 stack_slots,
1718 vep_offset);
1719 } else {
1720 guarantee(false, "Unknown Continuation native intrinsic");
1721 }
1722
1723 #ifdef ASSERT
1724 if (method->is_continuation_enter_intrinsic()) {
1725 assert(interpreted_entry_offset != -1, "Must be set");
1726 assert(exception_offset != -1, "Must be set");
1727 } else {
1728 assert(interpreted_entry_offset == -1, "Must be unset");
1729 assert(exception_offset == -1, "Must be unset");
1730 }
1731 assert(frame_complete != -1, "Must be set");
1732 assert(stack_slots != -1, "Must be set");
1733 assert(vep_offset != -1, "Must be set");
1734 #endif
1735
1736 __ flush();
1737 nmethod* nm = nmethod::new_native_nmethod(method,
1738 compile_id,
1739 masm->code(),
1740 vep_offset,
1741 frame_complete,
1742 stack_slots,
1743 in_ByteSize(-1),
1744 in_ByteSize(-1),
1745 oop_maps,
1746 exception_offset);
1747 if (method->is_continuation_enter_intrinsic()) {
1748 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1749 } else if (method->is_continuation_yield_intrinsic()) {
1750 _cont_doYield_stub = nm;
1751 }
1752 return nm;
1753 }
1754
1755 if (method->is_method_handle_intrinsic()) {
1756 vmIntrinsics::ID iid = method->intrinsic_id();
1757 intptr_t start = (intptr_t)__ pc();
1758 int vep_offset = ((intptr_t)__ pc()) - start;
1759 gen_special_dispatch(masm,
1760 method,
1761 in_sig_bt,
1762 in_regs);
1763 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1764 __ flush();
1765 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1766 return nmethod::new_native_nmethod(method,
1767 compile_id,
1768 masm->code(),
1769 vep_offset,
1770 frame_complete,
1771 stack_slots / VMRegImpl::slots_per_word,
1772 in_ByteSize(-1),
1773 in_ByteSize(-1),
1774 nullptr);
1775 }
1776 address native_func = method->native_function();
1777 assert(native_func != nullptr, "must have function");
1778
1779 // An OopMap for lock (and class if static)
1780 OopMapSet *oop_maps = new OopMapSet();
1781 intptr_t start = (intptr_t)__ pc();
1782
1783 // We have received a description of where all the Java args are located
1784 // on entry to the wrapper. We need to convert these args to where
1785 // the JNI function will expect them. To figure out where they go
1786 // we convert the Java signature to a C signature by inserting
1787 // the hidden arguments as arg[0] and possibly arg[1] (static method).
1788
1789 const int total_in_args = method->size_of_parameters();
1790 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1791
1792 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1793 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1794 BasicType* in_elem_bt = nullptr;
1795
1796 int argc = 0;
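// Hidden leading C args: the JNIEnv* (T_ADDRESS) comes first and, for static
// methods, the class mirror handle (T_OBJECT) comes second.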
1797 out_sig_bt[argc++] = T_ADDRESS;
1798 if (method->is_static()) {
1799 out_sig_bt[argc++] = T_OBJECT;
1800 }
1801
1802 for (int i = 0; i < total_in_args ; i++ ) {
1803 out_sig_bt[argc++] = in_sig_bt[i];
1804 }
1805
1806 // Now figure out where the args must be stored and how much stack space
1807 // they require.
1808 int out_arg_slots;
1809 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, nullptr, total_c_args);
1810
1811 // Compute framesize for the wrapper. We need to handlize all oops in
1812 // incoming registers
1813
1814 // Calculate the total number of stack slots we will need.
1815
1816 // First count the abi requirement plus all of the outgoing args
1817 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1818
1819 // Now the space for the inbound oop handle area
1820 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1821
1822 int oop_handle_offset = stack_slots;
1823 stack_slots += total_save_slots;
1824
1825 // Now any space we need for handlizing a klass if static method
1826
1827 int klass_slot_offset = 0;
1828 int klass_offset = -1;
1829 int lock_slot_offset = 0;
1830 bool is_static = false;
1831
1832 if (method->is_static()) {
1833 klass_slot_offset = stack_slots;
1834 stack_slots += VMRegImpl::slots_per_word;
1835 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1836 is_static = true;
1837 }
1838
1839 // Plus a lock if needed
1840
1841 if (method->is_synchronized()) {
1842 lock_slot_offset = stack_slots;
1843 stack_slots += VMRegImpl::slots_per_word;
1844 }
1845
1846 // Now a place (+2) to save return values or temp during shuffling
1847 // + 4 for return address (which we own) and saved rbp
1848 stack_slots += 6;
1849
1850 // OK, the space we have allocated will look like:
1851 //
1852 //
1853 // FP-> | |
1854 // |---------------------|
1855 // | 2 slots for moves |
1856 // |---------------------|
1857 // | lock box (if sync) |
1858 // |---------------------| <- lock_slot_offset
1859 // | klass (if static) |
1860 // |---------------------| <- klass_slot_offset
1861 // | oopHandle area |
1862 // |---------------------| <- oop_handle_offset (6 java arg registers)
1863 // | outbound memory |
1864 // | based arguments |
1865 // | |
1866 // |---------------------|
1867 // | |
1868 // SP-> | out_preserved_slots |
1869 //
1870 //
1871
1872
1873 // Now compute the actual number of stack slots we need, rounding up to keep the
1874 // stack properly aligned.
1875 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1876
1877 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1878
1879 // First, make an inline-cache (IC) check to see if we should even be here
1880
1881 // We are free to use all registers as temps without saving them and
1882 // restoring them except rbp. rbp is the only callee save register
1883 // as far as the interpreter and the compiler(s) are concerned.
1884
1885
1886 const Register ic_reg = rax;
1887 const Register receiver = j_rarg0;
1888
1889 Label hit;
1890 Label exception_pending;
1891
1892 assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
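// Inline cache check: rax holds the expected klass from the call site; on a
// mismatch we jump to the IC miss stub.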
1893 __ verify_oop(receiver);
1894 __ load_klass(rscratch1, receiver, rscratch2);
1895 __ cmpq(ic_reg, rscratch1);
1896 __ jcc(Assembler::equal, hit);
1897
1898 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1899
1900 // Verified entry point must be aligned
1901 __ align(8);
1902
1903 __ bind(hit);
1904
1905 int vep_offset = ((intptr_t)__ pc()) - start;
1906
1907 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1908 Label L_skip_barrier;
1909 Register klass = r10;
1910 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1911 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1912
1913 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1914
1915 __ bind(L_skip_barrier);
1916 }
1917
1918 #ifdef COMPILER1
1919 // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
1920 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1921 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1922 }
1923 #endif // COMPILER1
1924
1925 // The instruction at the verified entry point must be 5 bytes or longer
1926 // because it can be patched on the fly by make_non_entrant. The stack bang
1927 // instruction fits that requirement.
1928
1929 // Generate stack overflow check
1930 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1931
1932 // Generate a new frame for the wrapper.
1933 __ enter();
1934 // -2 because return address is already present and so is saved rbp
1935 __ subptr(rsp, stack_size - 2*wordSize);
1936
1937 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1938 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1939 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1940
1941 // Frame is now completed as far as size and linkage.
1942 int frame_complete = ((intptr_t)__ pc()) - start;
1943
1944 if (UseRTMLocking) {
1945 // Abort RTM transaction before calling JNI
1946 // because critical section will be large and will be
1947 // aborted anyway. Also nmethod could be deoptimized.
1948 __ xabort(0);
1949 }
1950
1951 #ifdef ASSERT
1952 __ check_stack_alignment(rsp, "improperly aligned stack");
1953 #endif /* ASSERT */
1954
1955
1956 // We use r14 as the oop handle for the receiver/klass
1957 // It is callee save so it survives the call to native
1958
1959 const Register oop_handle_reg = r14;
1960
1961 //
1962 // We immediately shuffle the arguments so that for any VM call we have to
1963 // make from here on out (sync slow path, jvmti, etc.) we will have
1964 // captured the oops from our caller and have a valid oopMap for
1965 // them.
1966
1967 // -----------------
1968 // The Grand Shuffle
1969
1970 // The Java calling convention is either equal (linux) or denser (win64) than the
1971 // C calling convention. However, because of the jni_env argument the C calling
1972 // convention always has at least one more argument (and two for static) than Java.
1973 // Therefore, if we move the args from Java -> C backwards, then we will never have
1974 // a register->register conflict and we don't have to build a dependency graph
1975 // and figure out how to break any cycles.
1976 //
1977
1978 // Record esp-based slot for receiver on stack for non-static methods
1979 int receiver_offset = -1;
1980
1981 // This is a trick. We double the stack slots so we can claim
1982 // the oops in the caller's frame. Since we are sure to have
1983 // more args than the caller, doubling is enough to make
1984 // sure we can capture all the incoming oop args from the
1985 // caller.
1986 //
1987 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1988
1989 // Mark location of rbp (someday)
1990 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1991
1992 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1993 // All inbound args are referenced based on rbp and all outbound args via rsp.
1994
1995
1996 #ifdef ASSERT
1997 bool reg_destroyed[Register::number_of_registers];
1998 bool freg_destroyed[XMMRegister::number_of_registers];
1999 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2000 reg_destroyed[r] = false;
2001 }
2002 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2003 freg_destroyed[f] = false;
2004 }
2005
2006 #endif /* ASSERT */
2007
2008 // For JNI natives the incoming and outgoing registers are offset upwards.
2009 GrowableArray<int> arg_order(2 * total_in_args);
2010
2011 VMRegPair tmp_vmreg;
2012 tmp_vmreg.set2(rbx->as_VMReg());
2013
2014 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2015 arg_order.push(i);
2016 arg_order.push(c_arg);
2017 }
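// Walking the pairs from the last argument down to the first means a source is
// never clobbered before it has been read (see "The Grand Shuffle" above).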
2018
2019 int temploc = -1;
2020 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2021 int i = arg_order.at(ai);
2022 int c_arg = arg_order.at(ai + 1);
2023 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2024 #ifdef ASSERT
2025 if (in_regs[i].first()->is_Register()) {
2026 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2027 } else if (in_regs[i].first()->is_XMMRegister()) {
2028 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2029 }
2030 if (out_regs[c_arg].first()->is_Register()) {
2031 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2032 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2033 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2034 }
2035 #endif /* ASSERT */
2036 switch (in_sig_bt[i]) {
2037 case T_ARRAY:
2038 case T_OBJECT:
2039 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2040 ((i == 0) && (!is_static)),
2041 &receiver_offset);
2042 break;
2043 case T_VOID:
2044 break;
2045
2046 case T_FLOAT:
2047 __ float_move(in_regs[i], out_regs[c_arg]);
2048 break;
2049
2050 case T_DOUBLE:
2051 assert( i + 1 < total_in_args &&
2052 in_sig_bt[i + 1] == T_VOID &&
2053 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2054 __ double_move(in_regs[i], out_regs[c_arg]);
2055 break;
2056
2057 case T_LONG :
2058 __ long_move(in_regs[i], out_regs[c_arg]);
2059 break;
2060
2061 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2062
2063 default:
2064 __ move32_64(in_regs[i], out_regs[c_arg]);
2065 }
2066 }
2067
2068 int c_arg;
2069
2070 // Pre-load a static method's oop into r14. Used both by locking code and
2071 // the normal JNI call code.
2072 // point c_arg at the first arg that is already loaded in case we
2073 // need to spill before we call out
2074 c_arg = total_c_args - total_in_args;
2075
2076 if (method->is_static()) {
2077
2078 // load oop into a register
2079 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2080
2081 // Now handlize the static class mirror; it's known to be non-null.
2082 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2083 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2084
2085 // Now get the handle
2086 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2087 // store the klass handle as second argument
2088 __ movptr(c_rarg1, oop_handle_reg);
2089 // and protect the arg if we must spill
2090 c_arg--;
2091 }
2092
2093 // Change state to native (we save the return address in the thread, since it might not
2094 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2095 // points into the right code segment. It does not have to be the correct return pc.
2096 // We use the same pc/oopMap repeatedly when we call out
2097
2098 intptr_t the_pc = (intptr_t) __ pc();
2099 oop_maps->add_gc_map(the_pc - start, map);
2100
2101 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2102
2103
2104 // We have all of the arguments set up at this point. We must not touch any of the
2105 // argument registers at this point (if we save/restore them, there are no oopMaps for them).
2106
2107 {
2108 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2109 // protect the args we've loaded
2110 save_args(masm, total_c_args, c_arg, out_regs);
2111 __ mov_metadata(c_rarg1, method());
2112 __ call_VM_leaf(
2113 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2114 r15_thread, c_rarg1);
2115 restore_args(masm, total_c_args, c_arg, out_regs);
2116 }
2117
2118 // RedefineClasses() tracing support for obsolete method entry
2119 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2120 // protect the args we've loaded
2121 save_args(masm, total_c_args, c_arg, out_regs);
2122 __ mov_metadata(c_rarg1, method());
2123 __ call_VM_leaf(
2124 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2125 r15_thread, c_rarg1);
2126 restore_args(masm, total_c_args, c_arg, out_regs);
2127 }
2128
2129 // Lock a synchronized method
2130
2131 // Register definitions used by locking and unlocking
2132
2133 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2134 const Register obj_reg = rbx; // Will contain the oop
2135 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2136 const Register old_hdr = r13; // value of old header at unlock time
2137
2138 Label slow_path_lock;
2139 Label lock_done;
2140
2141 if (method->is_synchronized()) {
2142 Label count_mon;
2143
2144 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2145
2146 // Get the handle (the 2nd argument)
2147 __ mov(oop_handle_reg, c_rarg1);
2148
2149 // Get address of the box
2150
2151 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2152
2153 // Load the oop from the handle
2154 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2155
2156 if (LockingMode == LM_MONITOR) {
2157 __ jmp(slow_path_lock);
2158 } else if (LockingMode == LM_LEGACY) {
2159 // Load immediate 1 into swap_reg %rax
2160 __ movl(swap_reg, 1);
2161
2162 // Load (object->mark() | 1) into swap_reg %rax
2163 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2164
2165 // Save (object->mark() | 1) into BasicLock's displaced header
2166 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2167
2168 // src -> dest iff dest == rax else rax <- dest
2169 __ lock();
2170 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2171 __ jcc(Assembler::equal, count_mon);
2172
2173 // Hmm should this move to the slow path code area???
2174
2175 // Test if the oopMark is an obvious stack pointer, i.e.,
2176 // 1) (mark & 3) == 0, and
2177 // 2) rsp <= mark < mark + os::pagesize()
2178 // These 3 tests can be done by evaluating the following
2179 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2180 // assuming both stack pointer and pagesize have their
2181 // least significant 2 bits clear.
2182 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2183
2184 __ subptr(swap_reg, rsp);
2185 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2186
2187 // Save the test result, for recursive case, the result is zero
2188 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2189 __ jcc(Assembler::notEqual, slow_path_lock);
2190 } else {
2191 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2192 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2193 }
2194 __ bind(count_mon);
2195 __ inc_held_monitor_count();
2196
2197 // Slow path will re-enter here
2198 __ bind(lock_done);
2199 }
2200
2201 // Finally just about ready to make the JNI call
2202
2203 // get JNIEnv* which is first argument to native
2204 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2205
2206 // Now set thread in native
2207 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2208
2209 __ call(RuntimeAddress(native_func));
2210
2211 // Verify or restore cpu control state after JNI call
2212 __ restore_cpu_control_state_after_jni(rscratch1);
2213
2214 // Unpack native results.
2215 switch (ret_type) {
2216 case T_BOOLEAN: __ c2bool(rax); break;
2217 case T_CHAR : __ movzwl(rax, rax); break;
2218 case T_BYTE : __ sign_extend_byte (rax); break;
2219 case T_SHORT : __ sign_extend_short(rax); break;
2220 case T_INT : /* nothing to do */ break;
2221 case T_DOUBLE :
2222 case T_FLOAT :
2223 // Result is in xmm0 we'll save as needed
2224 break;
2225 case T_ARRAY: // Really a handle
2226 case T_OBJECT: // Really a handle
2227 break; // can't de-handlize until after safepoint check
2228 case T_VOID: break;
2229 case T_LONG: break;
2230 default : ShouldNotReachHere();
2231 }
2232
2233 Label after_transition;
2234
2235 // Switch thread to "native transition" state before reading the synchronization state.
2236 // This additional state is necessary because reading and testing the synchronization
2237 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2238 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2239 // VM thread changes sync state to synchronizing and suspends threads for GC.
2240 // Thread A is resumed to finish this native method, but doesn't block here since it
2241 // didn't see any synchronization in progress, and escapes.
2242 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2243
2244 // Force this write out before the read below
2245 if (!UseSystemMemoryBarrier) {
2246 __ membar(Assembler::Membar_mask_bits(
2247 Assembler::LoadLoad | Assembler::LoadStore |
2248 Assembler::StoreLoad | Assembler::StoreStore));
2249 }
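// (With UseSystemMemoryBarrier the per-transition fence is elided; the VM
// issues a process-wide memory barrier when it needs to serialize with this
// thread instead.)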
2250
2251 // check for safepoint operation in progress and/or pending suspend requests
2252 {
2253 Label Continue;
2254 Label slow_path;
2255
2256 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2257
2258 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2259 __ jcc(Assembler::equal, Continue);
2260 __ bind(slow_path);
2261
2262 // Don't use call_VM, as it will see a possible pending exception and forward it
2263 // and never return here, preventing us from clearing _last_native_pc down below.
2264 // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2265 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2266 // by hand.
2267 //
2268 __ vzeroupper();
2269 save_native_result(masm, ret_type, stack_slots);
2270 __ mov(c_rarg0, r15_thread);
2271 __ mov(r12, rsp); // remember sp
2272 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2273 __ andptr(rsp, -16); // align stack as required by ABI
2274 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2275 __ mov(rsp, r12); // restore sp
2276 __ reinit_heapbase();
2277 // Restore any method result value
2278 restore_native_result(masm, ret_type, stack_slots);
2279 __ bind(Continue);
2280 }
2281
2282 // change thread state
2283 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2284 __ bind(after_transition);
2285
2286 Label reguard;
2287 Label reguard_done;
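// If a stack overflow while in native code disabled the yellow zone, re-enable
// it (out-of-line slow path below) before returning to Java.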
2288 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2289 __ jcc(Assembler::equal, reguard);
2290 __ bind(reguard_done);
2291
2292 // native result if any is live
2293
2294 // Unlock
2295 Label slow_path_unlock;
2296 Label unlock_done;
2297 if (method->is_synchronized()) {
2298
2299 Label fast_done;
2300
2301 // Get locked oop from the handle we passed to jni
2302 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2303
2304 if (LockingMode == LM_LEGACY) {
2305 Label not_recur;
2306 // Simple recursive lock?
2307 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2308 __ jcc(Assembler::notEqual, not_recur);
2309 __ dec_held_monitor_count();
2310 __ jmpb(fast_done);
2311 __ bind(not_recur);
2312 }
2313
2314 // Must save rax if it is live now because cmpxchg must use it
2315 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2316 save_native_result(masm, ret_type, stack_slots);
2317 }
2318
2319 if (LockingMode == LM_MONITOR) {
2320 __ jmp(slow_path_unlock);
2321 } else if (LockingMode == LM_LEGACY) {
2322 // get address of the stack lock
2323 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2324 // get old displaced header
2325 __ movptr(old_hdr, Address(rax, 0));
2326
2327 // Atomic swap old header if oop still contains the stack lock
2328 __ lock();
2329 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2330 __ jcc(Assembler::notEqual, slow_path_unlock);
2331 __ dec_held_monitor_count();
2332 } else {
2333 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2334 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2335 __ dec_held_monitor_count();
2336 }
2337
2338 // slow path re-enters here
2339 __ bind(unlock_done);
2340 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2341 restore_native_result(masm, ret_type, stack_slots);
2342 }
2343
2344 __ bind(fast_done);
2345 }
2346 {
2347 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2348 save_native_result(masm, ret_type, stack_slots);
2349 __ mov_metadata(c_rarg1, method());
2350 __ call_VM_leaf(
2351 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2352 r15_thread, c_rarg1);
2353 restore_native_result(masm, ret_type, stack_slots);
2354 }
2355
2356 __ reset_last_Java_frame(false);
2357
2358 // Unbox oop result, e.g. JNIHandles::resolve value.
2359 if (is_reference_type(ret_type)) {
2360 __ resolve_jobject(rax /* value */,
2361 r15_thread /* thread */,
2362 rcx /* tmp */);
2363 }
2364
2365 if (CheckJNICalls) {
2366 // clear_pending_jni_exception_check
2367 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2368 }
2369
2370 // reset handle block
2371 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2372 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2373
2374 // pop our frame
2375
2376 __ leave();
2377
2378 // Any exception pending?
2379 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2380 __ jcc(Assembler::notEqual, exception_pending);
2381
2382 // Return
2383
2384 __ ret(0);
2385
2386 // Unexpected paths are out of line and go here
2387
2388 // forward the exception
2389 __ bind(exception_pending);
2390
2391 // and forward the exception
2392 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2393
2394 // Slow path locking & unlocking
2395 if (method->is_synchronized()) {
2396
2397 // BEGIN Slow path lock
2398 __ bind(slow_path_lock);
2399
2400 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2401 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2402
2403 // protect the args we've loaded
2404 save_args(masm, total_c_args, c_arg, out_regs);
2405
2406 __ mov(c_rarg0, obj_reg);
2407 __ mov(c_rarg1, lock_reg);
2408 __ mov(c_rarg2, r15_thread);
2409
2410 // Not a leaf but we have last_Java_frame setup as we want
2411 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2412 restore_args(masm, total_c_args, c_arg, out_regs);
2413
2414 #ifdef ASSERT
2415 { Label L;
2416 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2417 __ jcc(Assembler::equal, L);
2418 __ stop("no pending exception allowed on exit from monitorenter");
2419 __ bind(L);
2420 }
2421 #endif
2422 __ jmp(lock_done);
2423
2424 // END Slow path lock
2425
2426 // BEGIN Slow path unlock
2427 __ bind(slow_path_unlock);
2428
2429 // If we haven't already saved the native result we must save it now as xmm registers
2430 // are still exposed.
2431 __ vzeroupper();
2432 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2433 save_native_result(masm, ret_type, stack_slots);
2434 }
2435
2436 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2437
2438 __ mov(c_rarg0, obj_reg);
2439 __ mov(c_rarg2, r15_thread);
2440 __ mov(r12, rsp); // remember sp
2441 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2442 __ andptr(rsp, -16); // align stack as required by ABI
2443
2444 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2445 // NOTE that obj_reg == rbx currently
2446 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2447 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2448
2449 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2450 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2451 __ mov(rsp, r12); // restore sp
2452 __ reinit_heapbase();
2453 #ifdef ASSERT
2454 {
2455 Label L;
2456 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2457 __ jcc(Assembler::equal, L);
2458 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2459 __ bind(L);
2460 }
2461 #endif /* ASSERT */
2462
2463 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2464
2465 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2466 restore_native_result(masm, ret_type, stack_slots);
2467 }
2468 __ jmp(unlock_done);
2469
2470 // END Slow path unlock
2471
2472 } // synchronized
2473
2474 // SLOW PATH Reguard the stack if needed
2475
2476 __ bind(reguard);
2477 __ vzeroupper();
2478 save_native_result(masm, ret_type, stack_slots);
2479 __ mov(r12, rsp); // remember sp
2480 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2481 __ andptr(rsp, -16); // align stack as required by ABI
2482 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2483 __ mov(rsp, r12); // restore sp
2484 __ reinit_heapbase();
2485 restore_native_result(masm, ret_type, stack_slots);
2486 // and continue
2487 __ jmp(reguard_done);
2488
2489
2490
2491 __ flush();
2492
2493 nmethod *nm = nmethod::new_native_nmethod(method,
2494 compile_id,
2495 masm->code(),
2496 vep_offset,
2497 frame_complete,
2498 stack_slots / VMRegImpl::slots_per_word,
2499 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2500 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2501 oop_maps);
2502
2503 return nm;
2504 }
2505
2506 // This function returns the adjustment size (in number of words) to a c2i adapter
2507 // activation for use during deoptimization.
2508 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2509 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2510 }
2511
2512
2513 uint SharedRuntime::out_preserve_stack_slots() {
2514 return 0;
2515 }
2516
2517
2518 // Number of stack slots between incoming argument block and the start of
2519 // a new frame. The PROLOG must add this many slots to the stack. The
2520 // EPILOG must remove this many slots. amd64 needs two slots for
2521 // return address.
2522 uint SharedRuntime::in_preserve_stack_slots() {
2523 return 4 + 2 * VerifyStackAtCalls;
2524 }
2525
2526 //------------------------------generate_deopt_blob----------------------------
2527 void SharedRuntime::generate_deopt_blob() {
2528 // Allocate space for the code
2529 ResourceMark rm;
2530 // Setup code generation tools
2531 int pad = 0;
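// Saving the wide vector (and opmask) state emits considerably more code, so
// grow the buffer when AVX-512 is available.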
2532 if (UseAVX > 2) {
2533 pad += 1024;
2534 }
2535 #if INCLUDE_JVMCI
2536 if (EnableJVMCI) {
2537 pad += 512; // Increase the buffer size when compiling for JVMCI
2538 }
2539 #endif
2540 CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2541 MacroAssembler* masm = new MacroAssembler(&buffer);
2542 int frame_size_in_words;
2543 OopMap* map = nullptr;
2544 OopMapSet *oop_maps = new OopMapSet();
2545
2546 // -------------
2547 // This code enters when returning to a de-optimized nmethod. A return
2548 // address has been pushed on the stack, and return values are in
2549 // registers.
2550 // If we are doing a normal deopt then we were called from the patched
2551 // nmethod from the point we returned to the nmethod. So the return
2552 // address on the stack is wrong by NativeCall::instruction_size
2553 // We will adjust the value so it looks like we have the original return
2554 // address on the stack (like when we eagerly deoptimized).
2555 // In the case of an exception pending when deoptimizing, we enter
2556 // with a return address on the stack that points after the call we patched
2557 // into the exception handler. We have the following register state from,
2558 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2559 // rax: exception oop
2560 // rbx: exception handler
2561 // rdx: throwing pc
2562 // So in this case we simply jam rdx into the useless return address and
2563 // the stack looks just like we want.
2564 //
2565 // At this point we need to de-opt. We save the argument return
2566 // registers. We call the first C routine, fetch_unroll_info(). This
2567 // routine captures the return values and returns a structure which
2568 // describes the current frame size and the sizes of all replacement frames.
2569 // The current frame is compiled code and may contain many inlined
2570 // functions, each with their own JVM state. We pop the current frame, then
2571 // push all the new frames. Then we call the C routine unpack_frames() to
2572 // populate these frames. Finally unpack_frames() returns us the new target
2573 // address. Notice that callee-save registers are BLOWN here; they have
2574 // already been captured in the vframeArray at the time the return PC was
2575 // patched.
2576 address start = __ pc();
2577 Label cont;
2578
2579 // Prolog for non exception case!
2580
2581 // Save everything in sight.
2582 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2583
2584 // Normal deoptimization. Save exec mode for unpack_frames.
2585 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2586 __ jmp(cont);
2587
2588 int reexecute_offset = __ pc() - start;
2589 #if INCLUDE_JVMCI && !defined(COMPILER1)
2590 if (EnableJVMCI && UseJVMCICompiler) {
2591 // JVMCI does not use this kind of deoptimization
2592 __ should_not_reach_here();
2593 }
2594 #endif
2595
2596 // Reexecute case
2597 // The return address is the pc that describes which bci to re-execute at.
2598
2599 // No need to update map as each call to save_live_registers will produce identical oopmap
2600 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2601
2602 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2603 __ jmp(cont);
2604
2605 #if INCLUDE_JVMCI
2606 Label after_fetch_unroll_info_call;
2607 int implicit_exception_uncommon_trap_offset = 0;
2608 int uncommon_trap_offset = 0;
2609
2610 if (EnableJVMCI) {
2611 implicit_exception_uncommon_trap_offset = __ pc() - start;
2612
2613 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2614 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2615
2616 uncommon_trap_offset = __ pc() - start;
2617
2618 // Save everything in sight.
2619 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2620 // fetch_unroll_info needs to call last_java_frame()
2621 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2622
2623 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2624 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2625
2626 __ movl(r14, Deoptimization::Unpack_reexecute);
2627 __ mov(c_rarg0, r15_thread);
2628 __ movl(c_rarg2, r14); // exec mode
2629 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2630 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2631
2632 __ reset_last_Java_frame(false);
2633
2634 __ jmp(after_fetch_unroll_info_call);
2635 } // EnableJVMCI
2636 #endif // INCLUDE_JVMCI
2637
2638 int exception_offset = __ pc() - start;
2639
2640 // Prolog for exception case
2641
2642 // all registers are dead at this entry point, except for rax, and
2643 // rdx which contain the exception oop and exception pc
2644 // respectively. Set them in TLS and fall thru to the
2645 // unpack_with_exception_in_tls entry point.
2646
2647 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2648 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2649
2650 int exception_in_tls_offset = __ pc() - start;
2651
2652 // new implementation because exception oop is now passed in JavaThread
2653
2654 // Prolog for exception case
2655 // All registers must be preserved because they might be used by LinearScan
2656 // Exception oop and throwing PC are passed in JavaThread
2657 // tos: stack at point of call to method that threw the exception (i.e. only
2658 // args are on the stack, no return address)
2659
2660 // make room on stack for the return address
2661 // It will be patched later with the throwing pc. The correct value is not
2662 // available now because loading it from memory would destroy registers.
2663 __ push(0);
2664
2665 // Save everything in sight.
2666 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2667
2668 // Now it is safe to overwrite any register
2669
2670 // Deopt during an exception. Save exec mode for unpack_frames.
2671 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2672
2673 // load throwing pc from JavaThread and patch it as the return address
2674 // of the current frame. Then clear the field in JavaThread
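// (rbp + wordSize is the return-address slot reserved by the push(0) above.)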
2675
2676 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2677 __ movptr(Address(rbp, wordSize), rdx);
2678 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2679
2680 #ifdef ASSERT
2681 // verify that there is really an exception oop in JavaThread
2682 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2683 __ verify_oop(rax);
2684
2685 // verify that there is no pending exception
2686 Label no_pending_exception;
2687 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2688 __ testptr(rax, rax);
2689 __ jcc(Assembler::zero, no_pending_exception);
2690 __ stop("must not have pending exception here");
2691 __ bind(no_pending_exception);
2692 #endif
2693
2694 __ bind(cont);
2695
2696 // Call C code. Need thread and this frame, but NOT official VM entry
2697 // crud. We cannot block on this call, no GC can happen.
2698 //
2699 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2700
2701 // fetch_unroll_info needs to call last_java_frame().
2702
2703 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2704 #ifdef ASSERT
2705 { Label L;
2706 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2707 __ jcc(Assembler::equal, L);
2708 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2709 __ bind(L);
2710 }
2711 #endif // ASSERT
2712 __ mov(c_rarg0, r15_thread);
2713 __ movl(c_rarg1, r14); // exec_mode
2714 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2715
2716 // Need to have an oopmap that tells fetch_unroll_info where to
2717 // find any register it might need.
2718 oop_maps->add_gc_map(__ pc() - start, map);
2719
2720 __ reset_last_Java_frame(false);
2721
2722 #if INCLUDE_JVMCI
2723 if (EnableJVMCI) {
2724 __ bind(after_fetch_unroll_info_call);
2725 }
2726 #endif
2727
2728 // Load UnrollBlock* into rdi
2729 __ mov(rdi, rax);
2730
2731 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2732 Label noException;
2733 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2734 __ jcc(Assembler::notEqual, noException);
2735 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2736 // QQQ this is useless it was null above
2737 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2738 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2739 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2740
2741 __ verify_oop(rax);
2742
2743 // Overwrite the result registers with the exception results.
2744 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2745 // I think this is useless
2746 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2747
2748 __ bind(noException);
2749
2750 // Only register save data is on the stack.
2751 // Now restore the result registers. Everything else is either dead
2752 // or captured in the vframeArray.
2753 RegisterSaver::restore_result_registers(masm);
2754
2755 // All of the register save area has been popped off the stack. Only the
2756 // return address remains.
2757
2758 // Pop all the frames we must move/replace.
2759 //
2760 // Frame picture (youngest to oldest)
2761 // 1: self-frame (no frame link)
2762 // 2: deopting frame (no frame link)
2763 // 3: caller of deopting frame (could be compiled/interpreted).
2764 //
2765 // Note: by leaving the return address of self-frame on the stack
2766 // and using the size of frame 2 to adjust the stack
2767 // when we are done the return to frame 3 will still be on the stack.
2768
2769 // Pop deoptimized frame
2770 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2771 __ addptr(rsp, rcx);
2772
2773 // rsp should be pointing at the return address to the caller (3)
2774
2775 // Pick up the initial fp we should save
2776 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2777 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2778
2779 #ifdef ASSERT
2780 // Compilers generate code that bangs the stack by as much as the
2781 // interpreter would need. So this stack banging should never
2782 // trigger a fault. Verify that it does not on non-product builds.
2783 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2784 __ bang_stack_size(rbx, rcx);
2785 #endif
2786
2787 // Load address of array of frame pcs into rcx
2788 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2789
2790 // Trash the old pc
2791 __ addptr(rsp, wordSize);
2792
2793 // Load address of array of frame sizes into rsi
2794 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2795
2796 // Load counter into rdx
2797 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2798
2799 // Now adjust the caller's stack to make up for the extra locals
2800 // but record the original sp so that we can save it in the skeletal interpreter
2801 // frame and the stack walking of interpreter_sender will get the unextended sp
2802 // value and not the "real" sp value.
2803
2804 const Register sender_sp = r8;
2805
2806 __ mov(sender_sp, rsp);
2807 __ movl(rbx, Address(rdi,
2808 Deoptimization::UnrollBlock::
2809 caller_adjustment_offset()));
2810 __ subptr(rsp, rbx);
2811
2812 // Push interpreter frames in a loop
2813 Label loop;
2814 __ bind(loop);
2815 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2816 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2817 __ pushptr(Address(rcx, 0)); // Save return address
2818 __ enter(); // Save old & set new ebp
2819 __ subptr(rsp, rbx); // Prolog
2820 // This value is corrected by layout_activation_impl
2821 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2822 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2823 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2824 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2825 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2826 __ decrementl(rdx); // Decrement counter
2827 __ jcc(Assembler::notZero, loop);
2828 __ pushptr(Address(rcx, 0)); // Save final return address
2829
2830 // Re-push self-frame
2831 __ enter(); // Save old & set new ebp
2832
2833 // Allocate a full sized register save area.
2834 // Return address and rbp are in place, so we allocate two less words.
2835 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2836
2837 // Restore frame locals after moving the frame
2838 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2839 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2840
2841 // Call C code. Need thread but NOT official VM entry
2842 // crud. We cannot block on this call, no GC can happen. Call should
2843 // restore return values to their stack-slots with the new SP.
2844 //
2845 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2846
2847 // Use rbp because the frames look interpreted now
2848 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2849 // Don't need the precise return PC here, just precise enough to point into this code blob.
2850 address the_pc = __ pc();
2851 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2852
2853 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2854 __ mov(c_rarg0, r15_thread);
2855 __ movl(c_rarg1, r14); // second arg: exec_mode
2856 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2857 // Revert SP alignment after call since we're going to do some SP relative addressing below
2858 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2859
2860 // Set an oopmap for the call site
2861 // Use the same PC we used for the last java frame
2862 oop_maps->add_gc_map(the_pc - start,
2863 new OopMap( frame_size_in_words, 0 ));
2864
2865 // Clear fp AND pc
2866 __ reset_last_Java_frame(true);
2867
2868 // Collect return values
2869 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2870 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2871 // I think this is useless (throwing pc?)
2872 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2873
2874 // Pop self-frame.
2875 __ leave(); // Epilog
2876
2877 // Jump to interpreter
2878 __ ret(0);
2879
2880 // Make sure all code is generated
2881 masm->flush();
2882
2883 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2884 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2885 #if INCLUDE_JVMCI
2886 if (EnableJVMCI) {
2887 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2888 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2889 }
2890 #endif
2891 }
2892
2893 #ifdef COMPILER2
2894 //------------------------------generate_uncommon_trap_blob--------------------
2895 void SharedRuntime::generate_uncommon_trap_blob() {
2896 // Allocate space for the code
2897 ResourceMark rm;
2898 // Setup code generation tools
2899 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2900 MacroAssembler* masm = new MacroAssembler(&buffer);
2901
2902 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2903
2904 address start = __ pc();
2905
2906 if (UseRTMLocking) {
2907 // Abort RTM transaction before possible nmethod deoptimization.
2908 __ xabort(0);
2909 }
2910
2911 // Push self-frame. We get here with a return address on the
2912 // stack, so rsp is 8-byte aligned until we allocate our frame.
2913 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
2914
2915 // No callee saved registers. rbp is assumed implicitly saved
2916 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2917
2918 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2919 // runtime expects it.
2920 __ movl(c_rarg1, j_rarg0);
2921
2922 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2923
2924 // Call C code. Need thread but NOT official VM entry
2925 // crud. We cannot block on this call, no GC can happen. Call should
2926 // capture callee-saved registers as well as return values.
2927 // Thread is in rdi already.
2928 //
2929 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2930
2931 __ mov(c_rarg0, r15_thread);
2932 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2933 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2934
2935 // Set an oopmap for the call site
2936 OopMapSet* oop_maps = new OopMapSet();
2937 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2938
2939 // location of rbp is known implicitly by the frame sender code
2940
2941 oop_maps->add_gc_map(__ pc() - start, map);
2942
2943 __ reset_last_Java_frame(false);
2944
2945 // Load UnrollBlock* into rdi
2946 __ mov(rdi, rax);
2947
2948 #ifdef ASSERT
2949 { Label L;
2950 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2951 Deoptimization::Unpack_uncommon_trap);
2952 __ jcc(Assembler::equal, L);
2953 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2954 __ bind(L);
2955 }
2956 #endif
2957
2958 // Pop all the frames we must move/replace.
2959 //
2960 // Frame picture (youngest to oldest)
2961 // 1: self-frame (no frame link)
2962 // 2: deopting frame (no frame link)
2963 // 3: caller of deopting frame (could be compiled/interpreted).
2964
2965 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2966 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2967
2968 // Pop deoptimized frame (int)
2969 __ movl(rcx, Address(rdi,
2970 Deoptimization::UnrollBlock::
2971 size_of_deoptimized_frame_offset()));
2972 __ addptr(rsp, rcx);
2973
2974 // rsp should be pointing at the return address to the caller (3)
2975
2976 // Pick up the initial fp we should save
2977 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2978 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2979
2980 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need.  So this stack banging should never
  // trigger a fault.  Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2985 __ bang_stack_size(rbx, rcx);
2986 #endif
2987
2988 // Load address of array of frame pcs into rcx (address*)
2989 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2990
2991 // Trash the return pc
2992 __ addptr(rsp, wordSize);
2993
2994 // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2996
2997 // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2999
  // Now adjust the caller's stack to make up for the extra locals, but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame; the interpreter-frame sender's stack walk will then
  // see the unextended sp value rather than the "real" sp value.
3004
3005 const Register sender_sp = r8;
3006
3007 __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3009 __ subptr(rsp, rbx);
3010
3011 // Push interpreter frames in a loop
3012 Label loop;
3013 __ bind(loop);
3014 __ movptr(rbx, Address(rsi, 0)); // Load frame size
3015 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
3016 __ pushptr(Address(rcx, 0)); // Save return address
3017 __ enter(); // Save old & set new rbp
3018 __ subptr(rsp, rbx); // Prolog
3019 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3020 sender_sp); // Make it walkable
3021 // This value is corrected by layout_activation_impl
3022 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3023 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
3024 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
3025 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
3026 __ decrementl(rdx); // Decrement counter
3027 __ jcc(Assembler::notZero, loop);
3028 __ pushptr(Address(rcx, 0)); // Save final return address
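
  // Roughly, each pass through the loop above does the equivalent of the
  // following (an illustrative C-like sketch, not the generated code):
  //
  //   push(frame_pcs[k]);                 // return address for the frame being built
  //   push(rbp); rbp = rsp;               // enter(): link to the previous frame
  //   rsp -= frame_sizes[k] - 2*wordSize; // body of the skeletal frame
  //   rbp[sender_sp_offset] = sender_sp;  // make the frame walkable
  //   rbp[last_sp_offset]   = NULL_WORD;  // corrected later by layout_activation_impl
  //   sender_sp = rsp;                    // becomes the next frame's sender sp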
3029
3030 // Re-push self-frame
3031 __ enter(); // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); // Prolog
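  // (framesize - 4: the return pc and rbp, two 32-bit slots each, have
  // already been pushed by the pushptr()/enter() just above.)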
3034
3035 // Use rbp because the frames look interpreted now
3036 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3037 // Don't need the precise return PC here, just precise enough to point into this code blob.
3038 address the_pc = __ pc();
3039 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3040
  // Call C code.  We need the thread pointer but NOT the official VM entry
  // crud.  We cannot block on this call and no GC can happen.  The call should
  // restore return values to their stack slots with the new SP.
  // The thread is in r15_thread; copy it into c_rarg0 below.
3045 //
3046 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3047
3048 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3049 __ mov(c_rarg0, r15_thread);
3050 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3051 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3052
3053 // Set an oopmap for the call site
3054 // Use the same PC we used for the last java frame
3055 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3056
3057 // Clear fp AND pc
3058 __ reset_last_Java_frame(true);
3059
3060 // Pop self-frame.
3061 __ leave(); // Epilog
3062
3063 // Jump to interpreter
3064 __ ret(0);
3065
3066 // Make sure all code is generated
3067 masm->flush();
3068
3069 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3070 SimpleRuntimeFrame::framesize >> 1);
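  // (SimpleRuntimeFrame::framesize counts 32-bit slots; ">> 1" converts it to
  // the 64-bit word count expected for the blob's frame size.)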
3071 }
3072 #endif // COMPILER2
3073
3074 //------------------------------generate_handler_blob------
3075 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
3078 //
3079 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3080 assert(StubRoutines::forward_exception_entry() != nullptr,
3081 "must be generated before");
3082
3083 ResourceMark rm;
3084 OopMapSet *oop_maps = new OopMapSet();
3085 OopMap* map;
3086
3087 // Allocate space for the code. Setup code generation tools.
3088 CodeBuffer buffer("handler_blob", 2048, 1024);
3089 MacroAssembler* masm = new MacroAssembler(&buffer);
3090
3091 address start = __ pc();
3092 address call_pc = nullptr;
3093 int frame_size_in_words;
3094 bool cause_return = (poll_type == POLL_AT_RETURN);
3095 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
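  // cause_return is true for polls taken at method returns; save_wide_vectors
  // is true only for polls inside vectorized loops, where the upper portions
  // of the vector registers may hold live values and must be preserved as well.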
3096
3097 if (UseRTMLocking) {
    // Abort the RTM transaction before calling the runtime
    // because the critical section will be large and will be
    // aborted anyway.  Also, the nmethod could be deoptimized.
3101 __ xabort(0);
3102 }
3103
3104 // Make room for return address (or push it again)
3105 if (!cause_return) {
3106 __ push(rbx);
3107 }
3108
3109 // Save registers, fpu state, and flags
3110 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3111
3112 // The following is basically a call_VM. However, we need the precise
3113 // address of the call in order to generate an oopmap. Hence, we do all the
3114 // work ourselves.
3115
  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3117
  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.
3120
3121 if (!cause_return) {
3122 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3123 // Additionally, rbx is a callee saved register and we can look at it later to determine
3124 // if someone changed the return address for us!
3125 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3126 __ movptr(Address(rbp, wordSize), rbx);
3127 }
3128
3129 // Do the call
3130 __ mov(c_rarg0, r15_thread);
3131 __ call(RuntimeAddress(call_ptr));
3132
3133 // Set an oopmap for the call site. This oopmap will map all
3134 // oop-registers and debug-info registers as callee-saved. This
3135 // will allow deoptimization at this safepoint to find all possible
3136 // debug-info recordings, as well as let GC find all oops.
3137
  oop_maps->add_gc_map(__ pc() - start, map);
3139
3140 Label noException;
3141
3142 __ reset_last_Java_frame(false);
3143
3144 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3145 __ jcc(Assembler::equal, noException);
3146
3147 // Exception pending
3148
3149 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3150
3151 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3152
3153 // No exception case
3154 __ bind(noException);
3155
3156 Label no_adjust;
3157 #ifdef ASSERT
3158 Label bail;
3159 #endif
3160 if (!cause_return) {
3161 Label no_prefix, not_special;
3162
3163 // If our stashed return pc was modified by the runtime we avoid touching it
3164 __ cmpptr(rbx, Address(rbp, wordSize));
3165 __ jccb(Assembler::notEqual, no_adjust);
3166
3167 // Skip over the poll instruction.
3168 // See NativeInstruction::is_safepoint_poll()
3169 // Possible encodings:
3170 // 85 00 test %eax,(%rax)
3171 // 85 01 test %eax,(%rcx)
3172 // 85 02 test %eax,(%rdx)
3173 // 85 03 test %eax,(%rbx)
3174 // 85 06 test %eax,(%rsi)
3175 // 85 07 test %eax,(%rdi)
3176 //
3177 // 41 85 00 test %eax,(%r8)
3178 // 41 85 01 test %eax,(%r9)
3179 // 41 85 02 test %eax,(%r10)
3180 // 41 85 03 test %eax,(%r11)
3181 // 41 85 06 test %eax,(%r14)
3182 // 41 85 07 test %eax,(%r15)
3183 //
3184 // 85 04 24 test %eax,(%rsp)
3185 // 41 85 04 24 test %eax,(%r12)
3186 // 85 45 00 test %eax,0x0(%rbp)
3187 // 41 85 45 00 test %eax,0x0(%r13)
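    //
    // The adjustment below therefore accounts for, at most:
    //   +1 byte for an optional REX.B prefix (0x41),
    //   +1 byte when the base register is rsp/rbp/r12/r13 (the SIB or disp8 byte),
    //   +2 bytes for the test opcode and ModRM byte.
    // For example, "41 85 04 24" (test %eax,(%r12)) is skipped as 1 + 1 + 2 = 4 bytes.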
3188
3189 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3190 __ jcc(Assembler::notEqual, no_prefix);
3191 __ addptr(rbx, 1);
3192 __ bind(no_prefix);
3193 #ifdef ASSERT
3194 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3195 #endif
3196 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3197 // r12/rsp 0x04
3198 // r13/rbp 0x05
3199 __ movzbq(rcx, Address(rbx, 1));
3200 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3201 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3202 __ cmpptr(rcx, 1);
3203 __ jcc(Assembler::above, not_special);
3204 __ addptr(rbx, 1);
3205 __ bind(not_special);
3206 #ifdef ASSERT
3207 // Verify the correct encoding of the poll we're about to skip.
3208 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3209 __ jcc(Assembler::notEqual, bail);
3210 // Mask out the modrm bits
3211 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3212 // rax encodes to 0, so if the bits are nonzero it's incorrect
3213 __ jcc(Assembler::notZero, bail);
3214 #endif
3215 // Adjust return pc forward to step over the safepoint poll instruction
3216 __ addptr(rbx, 2);
3217 __ movptr(Address(rbp, wordSize), rbx);
3218 }
3219
3220 __ bind(no_adjust);
3221 // Normal exit, restore registers and exit.
3222 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3223 __ ret(0);
3224
3225 #ifdef ASSERT
3226 __ bind(bail);
3227 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3228 #endif
3229
3230 // Make sure all code is generated
3231 masm->flush();
3232
3233 // Fill-out other meta info
3234 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3235 }
3236
3237 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are, so the caller
// must do any GC of the args.
3244 //
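// In practice (see the blob-generation code elsewhere in SharedRuntime),
// 'destination' is expected to be one of the resolve_*_call_C or
// handle_wrong_method* entry points, but this stub is agnostic about which
// routine it calls.
//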
3245 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3246 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3247
3248 // allocate space for the code
3249 ResourceMark rm;
3250
3251 CodeBuffer buffer(name, 1200, 512);
3252 MacroAssembler* masm = new MacroAssembler(&buffer);
3253
3254 int frame_size_in_words;
3255
3256 OopMapSet *oop_maps = new OopMapSet();
3257 OopMap* map = nullptr;
3258
3259 int start = __ offset();
3260
3261 // No need to save vector registers since they are caller-saved anyway.
3262 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3263
3264 int frame_complete = __ offset();
3265
3266 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3267
3268 __ mov(c_rarg0, r15_thread);
3269
3270 __ call(RuntimeAddress(destination));
3271
3272
3273 // Set an oopmap for the call site.
3274 // We need this not only for callee-saved registers, but also for volatile
3275 // registers that the compiler might be keeping live across a safepoint.
3276
  oop_maps->add_gc_map(__ offset() - start, map);
3278
3279 // rax contains the address we are going to jump to assuming no exception got installed
3280
3281 // clear last_Java_sp
3282 __ reset_last_Java_frame(false);
3283 // check for pending exceptions
3284 Label pending;
3285 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3286 __ jcc(Assembler::notEqual, pending);
3287
3288 // get the returned Method*
3289 __ get_vm_result_2(rbx, r15_thread);
3290 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3291
3292 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3293
3294 RegisterSaver::restore_live_registers(masm);
3295
3296 // We are back to the original state on entry and ready to go.
3297
3298 __ jmp(rax);
3299
3300 // Pending exception after the safepoint
3301
3302 __ bind(pending);
3303
3304 RegisterSaver::restore_live_registers(masm);
3305
3306 // exception pending => remove activation and forward to exception handler
3307
3308 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3309
3310 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3311 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3312
  // -------------
  // Make sure all code is generated
  masm->flush();

  // return the blob; the frame size is given in words
3319 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3320 }
3321
3322 //------------------------------Montgomery multiplication------------------------
3323 //
3324
3325 #ifndef _WINDOWS
3326
3327 // Subtract 0:b from carry:a. Return carry.
3328 static julong
3329 sub(julong a[], julong b[], julong carry, long len) {
3330 long long i = 0, cnt = len;
3331 julong tmp;
3332 asm volatile("clc; "
3333 "0: ; "
3334 "mov (%[b], %[i], 8), %[tmp]; "
3335 "sbb %[tmp], (%[a], %[i], 8); "
3336 "inc %[i]; dec %[cnt]; "
3337 "jne 0b; "
3338 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3339 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3340 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3341 : "memory");
3342 return tmp;
3343 }
3344
3345 // Multiply (unsigned) Long A by Long B, accumulating the double-
3346 // length result into the accumulator formed of T0, T1, and T2.
3347 #define MACC(A, B, T0, T1, T2) \
3348 do { \
3349 unsigned long hi, lo; \
3350 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3351 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3352 : "r"(A), "a"(B) : "cc"); \
3353 } while(0)
3354
3355 // As above, but add twice the double-length result into the
3356 // accumulator.
3357 #define MACC2(A, B, T0, T1, T2) \
3358 do { \
3359 unsigned long hi, lo; \
3360 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3361 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3362 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3363 : "r"(A), "a"(B) : "cc"); \
3364 } while(0)
3365
3366 #else //_WINDOWS
3367
3368 static julong
3369 sub(julong a[], julong b[], julong carry, long len) {
3370 long i;
3371 julong tmp;
3372 unsigned char c = 1;
3373 for (i = 0; i < len; i++) {
3374 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3375 a[i] = tmp;
3376 }
3377 c = _addcarry_u64(c, carry, ~0, &tmp);
3378 return tmp;
3379 }
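
// (The Windows variant computes a - b as a + ~b + 1 by seeding the carry chain
// with 1 and adding the one's complement of b; _addcarry_u64 and _umul128 are
// MSVC compiler intrinsics, since 64-bit MSVC has no inline assembly.)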
3380
3381 // Multiply (unsigned) Long A by Long B, accumulating the double-
3382 // length result into the accumulator formed of T0, T1, and T2.
3383 #define MACC(A, B, T0, T1, T2) \
3384 do { \
3385 julong hi, lo; \
3386 lo = _umul128(A, B, &hi); \
3387 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3388 c = _addcarry_u64(c, hi, T1, &T1); \
3389 _addcarry_u64(c, T2, 0, &T2); \
3390 } while(0)
3391
3392 // As above, but add twice the double-length result into the
3393 // accumulator.
3394 #define MACC2(A, B, T0, T1, T2) \
3395 do { \
3396 julong hi, lo; \
3397 lo = _umul128(A, B, &hi); \
3398 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3399 c = _addcarry_u64(c, hi, T1, &T1); \
3400 _addcarry_u64(c, T2, 0, &T2); \
3401 c = _addcarry_u64(0, lo, T0, &T0); \
3402 c = _addcarry_u64(c, hi, T1, &T1); \
3403 _addcarry_u64(c, T2, 0, &T2); \
3404 } while(0)
3405
3406 #endif //_WINDOWS
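
// In portable terms (an illustrative sketch only, assuming a 128-bit integer
// type were available), MACC computes
//
//   unsigned __int128 p = (unsigned __int128)(A) * (B);
//   (T2:T1:T0) += p;
//
// while MACC2 computes (T2:T1:T0) += 2 * p, which is what the squaring code
// below needs for its off-diagonal (cross) terms.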
3407
3408 // Fast Montgomery multiplication. The derivation of the algorithm is
3409 // in A Cryptographic Library for the Motorola DSP56000,
3410 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
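//
// Sketch of the reduction: with R = 2^(64*len) and inv == -n[0]^-1 (mod 2^64)
// (which is what the asserts below check), each outer iteration chooses
// m[i] = t0 * inv so that t0 + m[i]*n[0] == 0 (mod 2^64).  The low word of the
// triple-precision accumulator then vanishes and the running value can be
// shifted down by one word.  After len iterations the result is congruent to
// a*b*R^-1 (mod n); the trailing subtraction loop handles any carry left in
// the top word.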
3411
3412 static void NOINLINE
3413 montgomery_multiply(julong a[], julong b[], julong n[],
3414 julong m[], julong inv, int len) {
3415 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3416 int i;
3417
3418 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3419
3420 for (i = 0; i < len; i++) {
3421 int j;
3422 for (j = 0; j < i; j++) {
3423 MACC(a[j], b[i-j], t0, t1, t2);
3424 MACC(m[j], n[i-j], t0, t1, t2);
3425 }
3426 MACC(a[i], b[0], t0, t1, t2);
3427 m[i] = t0 * inv;
3428 MACC(m[i], n[0], t0, t1, t2);
3429
3430 assert(t0 == 0, "broken Montgomery multiply");
3431
3432 t0 = t1; t1 = t2; t2 = 0;
3433 }
3434
3435 for (i = len; i < 2*len; i++) {
3436 int j;
3437 for (j = i-len+1; j < len; j++) {
3438 MACC(a[j], b[i-j], t0, t1, t2);
3439 MACC(m[j], n[i-j], t0, t1, t2);
3440 }
3441 m[i-len] = t0;
3442 t0 = t1; t1 = t2; t2 = 0;
3443 }
3444
3445 while (t0)
3446 t0 = sub(m, n, t0, len);
3447 }
3448
3449 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3450 // multiplies so it should be up to 25% faster than Montgomery
3451 // multiplication. However, its loop control is more complex and it
3452 // may actually run slower on some machines.
3453
3454 static void NOINLINE
3455 montgomery_square(julong a[], julong n[],
3456 julong m[], julong inv, int len) {
3457 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3458 int i;
3459
3460 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3461
3462 for (i = 0; i < len; i++) {
3463 int j;
3464 int end = (i+1)/2;
3465 for (j = 0; j < end; j++) {
3466 MACC2(a[j], a[i-j], t0, t1, t2);
3467 MACC(m[j], n[i-j], t0, t1, t2);
3468 }
3469 if ((i & 1) == 0) {
3470 MACC(a[j], a[j], t0, t1, t2);
3471 }
3472 for (; j < i; j++) {
3473 MACC(m[j], n[i-j], t0, t1, t2);
3474 }
3475 m[i] = t0 * inv;
3476 MACC(m[i], n[0], t0, t1, t2);
3477
3478 assert(t0 == 0, "broken Montgomery square");
3479
3480 t0 = t1; t1 = t2; t2 = 0;
3481 }
3482
3483 for (i = len; i < 2*len; i++) {
3484 int start = i-len+1;
3485 int end = start + (len - start)/2;
3486 int j;
3487 for (j = start; j < end; j++) {
3488 MACC2(a[j], a[i-j], t0, t1, t2);
3489 MACC(m[j], n[i-j], t0, t1, t2);
3490 }
3491 if ((i & 1) == 0) {
3492 MACC(a[j], a[j], t0, t1, t2);
3493 }
3494 for (; j < len; j++) {
3495 MACC(m[j], n[i-j], t0, t1, t2);
3496 }
3497 m[i-len] = t0;
3498 t0 = t1; t1 = t2; t2 = 0;
3499 }
3500
3501 while (t0)
3502 t0 = sub(m, n, t0, len);
3503 }
3504
3505 // Swap words in a longword.
3506 static julong swap(julong x) {
3507 return (x << 32) | (x >> 32);
3508 }
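// For example, swap(0x0000000100000002ULL) == 0x0000000200000001ULL.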
3509
3510 // Copy len longwords from s to d, word-swapping as we go. The
3511 // destination array is reversed.
3512 static void reverse_words(julong *s, julong *d, int len) {
3513 d += len;
3514 while(len-- > 0) {
3515 d--;
3516 *d = swap(*s);
3517 s++;
3518 }
3519 }
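// For example, with len == 2 and s == { 0x0000000100000002, 0x0000000300000004 },
// the result is d == { 0x0000000400000003, 0x0000000200000001 }.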
3520
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3523 #define MONTGOMERY_SQUARING_THRESHOLD 64
3524
3525 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3526 jint len, jlong inv,
3527 jint *m_ints) {
3528 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3529 int longwords = len/2;
3530
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
3534 int divisor = sizeof(julong) * 4;
3535 guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof(julong) * 4;
3537 julong *scratch = (julong *)alloca(total_allocation);
3538
3539 // Local scratch arrays
3540 julong
3541 *a = scratch + 0 * longwords,
3542 *b = scratch + 1 * longwords,
3543 *n = scratch + 2 * longwords,
3544 *m = scratch + 3 * longwords;
3545
3546 reverse_words((julong *)a_ints, a, longwords);
3547 reverse_words((julong *)b_ints, b, longwords);
3548 reverse_words((julong *)n_ints, n, longwords);
3549
3550 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3551
3552 reverse_words(m, (julong *)m_ints, longwords);
3553 }
3554
3555 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3556 jint len, jlong inv,
3557 jint *m_ints) {
3558 assert(len % 2 == 0, "array length in montgomery_square must be even");
3559 int longwords = len/2;
3560
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
3564 int divisor = sizeof(julong) * 3;
3565 guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof(julong) * 3;
3567 julong *scratch = (julong *)alloca(total_allocation);
3568
3569 // Local scratch arrays
3570 julong
3571 *a = scratch + 0 * longwords,
3572 *n = scratch + 1 * longwords,
3573 *m = scratch + 2 * longwords;
3574
3575 reverse_words((julong *)a_ints, a, longwords);
3576 reverse_words((julong *)n_ints, n, longwords);
3577
3578 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3579 ::montgomery_square(a, n, m, (julong)inv, longwords);
3580 } else {
3581 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3582 }
3583
3584 reverse_words(m, (julong *)m_ints, longwords);
3585 }
3586
3587 #ifdef COMPILER2
3588 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3589 //
3590 //------------------------------generate_exception_blob---------------------------
// creates the exception blob at the end
// This code is jumped to from a compiled method via the exception blob
// (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call, we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
3600 //
3601 // This code is entered with a jmp.
3602 //
3603 // Arguments:
3604 // rax: exception oop
3605 // rdx: exception pc
3606 //
3607 // Results:
3608 // rax: exception oop
3609 // rdx: exception pc in caller or ???
3610 // destination: exception handler of caller
3611 //
3612 // Note: the exception pc MUST be at a call (precise debug information)
3613 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3614 //
3615
3616 void OptoRuntime::generate_exception_blob() {
3617 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3618 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3619 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3620
3621 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3622
3623 // Allocate space for the code
3624 ResourceMark rm;
3625 // Setup code generation tools
3626 CodeBuffer buffer("exception_blob", 2048, 1024);
3627 MacroAssembler* masm = new MacroAssembler(&buffer);
3628
3629
3630 address start = __ pc();
3631
3632 // Exception pc is 'return address' for stack walker
3633 __ push(rdx);
3634 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3635
3636 // Save callee-saved registers. See x86_64.ad.
3637
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3641
3642 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3643
  // Store the exception in the Thread object. We cannot pass any arguments to
  // the handle_exception call, since we do not want to make any assumptions
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3649 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3650
3651 // This call does all the hard work. It checks if an exception handler
3652 // exists in the method.
3653 // If so, it returns the handler address.
3654 // If not, it prepares for stack-unwinding, restoring the callee-save
3655 // registers of the frame being removed.
3656 //
3657 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3658
3659 // At a method handle call, the stack may not be properly aligned
3660 // when returning with an exception.
3661 address the_pc = __ pc();
3662 __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3663 __ mov(c_rarg0, r15_thread);
3664 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3666
3667 // Set an oopmap for the call site. This oopmap will only be used if we
3668 // are unwinding the stack. Hence, all locations will be dead.
3669 // Callee-saved registers will be the same as the frame above (i.e.,
3670 // handle_exception_stub), since they were restored when we got the
3671 // exception.
3672
3673 OopMapSet* oop_maps = new OopMapSet();
3674
3675 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3676
3677 __ reset_last_Java_frame(false);
3678
3679 // Restore callee-saved registers
3680
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3684
3685 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3686
3687 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3688 __ pop(rdx); // No need for exception pc anymore
3689
3690 // rax: exception handler
3691
3692 // We have a handler in rax (could be deopt blob).
3693 __ mov(r8, rax);
3694
3695 // Get the exception oop
3696 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3697 // Get the exception pc in case we are deoptimized
3698 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3699 #ifdef ASSERT
3700 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3701 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3702 #endif
3703 // Clear the exception oop so GC no longer processes it as a root.
3704 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3705
3706 // rax: exception oop
3707 // r8: exception handler
3708 // rdx: exception pc
3709 // Jump to handler
3710
3711 __ jmp(r8);
3712
3713 // Make sure all code is generated
3714 masm->flush();
3715
3716 // Set exception blob
3717 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3718 }
3719 #endif // COMPILER2
3720