1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_JVMCI
69 #include "jvmci/jvmciJavaClasses.hpp"
70 #endif
71
72 #define __ masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif // PRODUCT
79
80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
81
82 class RegisterSaver {
83 // Capture info about frame layout. Layout offsets are in jint
84 // units because compiler frame slots are jints.
85 #define XSAVE_AREA_BEGIN 160
86 #define XSAVE_AREA_YMM_BEGIN 576
87 #define XSAVE_AREA_EGPRS 960
88 #define XSAVE_AREA_OPMASK_BEGIN 1088
89 #define XSAVE_AREA_ZMM_BEGIN 1152
90 #define XSAVE_AREA_UPPERBANK 1664
91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
96 enum layout {
97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
99 DEF_XMM_OFFS(0),
100 DEF_XMM_OFFS(1),
101 // 2..15 are implied in range usage
102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
103 DEF_YMM_OFFS(0),
104 DEF_YMM_OFFS(1),
105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
106 r16H_off,
107 r17_off, r17H_off,
108 r18_off, r18H_off,
109 r19_off, r19H_off,
110 r20_off, r20H_off,
111 r21_off, r21H_off,
112 r22_off, r22H_off,
113 r23_off, r23H_off,
114 r24_off, r24H_off,
115 r25_off, r25H_off,
116 r26_off, r26H_off,
117 r27_off, r27H_off,
118 r28_off, r28H_off,
119 r29_off, r29H_off,
120 r30_off, r30H_off,
121 r31_off, r31H_off,
122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
123 DEF_OPMASK_OFFS(0),
124 DEF_OPMASK_OFFS(1),
125 // 2..7 are implied in range usage
126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
127 DEF_ZMM_OFFS(0),
128 DEF_ZMM_OFFS(1),
129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
130 DEF_ZMM_UPPER_OFFS(16),
131 DEF_ZMM_UPPER_OFFS(17),
132 // 18..31 are implied in range usage
133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
134 fpu_stateH_end,
135 r15_off, r15H_off,
136 r14_off, r14H_off,
137 r13_off, r13H_off,
138 r12_off, r12H_off,
139 r11_off, r11H_off,
140 r10_off, r10H_off,
141 r9_off, r9H_off,
142 r8_off, r8H_off,
143 rdi_off, rdiH_off,
144 rsi_off, rsiH_off,
145 ignore_off, ignoreH_off, // extra copy of rbp
146 rsp_off, rspH_off,
147 rbx_off, rbxH_off,
148 rdx_off, rdxH_off,
149 rcx_off, rcxH_off,
150 rax_off, raxH_off,
151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
152 align_off, alignH_off,
153 flags_off, flagsH_off,
154 // The frame sender code expects that rbp will be in the "natural" place and
155 // will override any oopMap setting for it. We must therefore force the layout
156 // so that it agrees with the frame sender code.
157 rbp_off, rbpH_off, // copy of rbp we will restore
158 return_off, returnH_off, // slot for return address
159 reg_save_size // size in compiler stack slots
160 };
161
162 public:
163 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
164 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
165
166 // Offsets into the register save area
167 // Used by deoptimization when it is managing result register
168 // values on its own
169
170 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
171 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
172 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
173 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; }
174 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
175 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
176
177 // During deoptimization only the result registers need to be restored,
178 // all the other values have already been extracted.
179 static void restore_result_registers(MacroAssembler* masm);
180 };
181
182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
183 int off = 0;
184 int num_xmm_regs = XMMRegister::available_xmm_registers();
185 #if COMPILER2_OR_JVMCI
186 if (save_wide_vectors && UseAVX == 0) {
187 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
188 }
189 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
190 #else
191 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
192 #endif
193
194 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
195 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
196 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
197 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
198 // CodeBlob frame size is in words.
199 int frame_size_in_words = frame_size_in_bytes / wordSize;
200 *total_frame_words = frame_size_in_words;
201
202 // Save registers, fpu state, and flags.
203 // We assume caller has already pushed the return address onto the
204 // stack, so rsp is 8-byte aligned here.
205 // We push rpb twice in this sequence because we want the real rbp
206 // to be under the return like a normal enter.
207
208 __ enter(); // rsp becomes 16-byte aligned here
209 __ pushf();
210 // Make sure rsp stays 16-byte aligned
211 __ subq(rsp, 8);
212 // Push CPU state in multiple of 16 bytes
213 __ save_legacy_gprs();
214 __ push_FPU_state();
215
216
217 // push cpu state handles this on EVEX enabled targets
218 if (save_wide_vectors) {
219 // Save upper half of YMM registers(0..15)
220 int base_addr = XSAVE_AREA_YMM_BEGIN;
221 for (int n = 0; n < 16; n++) {
222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
223 }
224 if (VM_Version::supports_evex()) {
225 // Save upper half of ZMM registers(0..15)
226 base_addr = XSAVE_AREA_ZMM_BEGIN;
227 for (int n = 0; n < 16; n++) {
228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
229 }
230 // Save full ZMM registers(16..num_xmm_regs)
231 base_addr = XSAVE_AREA_UPPERBANK;
232 off = 0;
233 int vector_len = Assembler::AVX_512bit;
234 for (int n = 16; n < num_xmm_regs; n++) {
235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
236 }
237 #if COMPILER2_OR_JVMCI
238 base_addr = XSAVE_AREA_OPMASK_BEGIN;
239 off = 0;
240 for(int n = 0; n < KRegister::number_of_registers; n++) {
241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
242 }
243 #endif
244 }
245 } else {
246 if (VM_Version::supports_evex()) {
247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
248 int base_addr = XSAVE_AREA_UPPERBANK;
249 off = 0;
250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
251 for (int n = 16; n < num_xmm_regs; n++) {
252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
253 }
254 #if COMPILER2_OR_JVMCI
255 base_addr = XSAVE_AREA_OPMASK_BEGIN;
256 off = 0;
257 for(int n = 0; n < KRegister::number_of_registers; n++) {
258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
259 }
260 #endif
261 }
262 }
263
264 #if COMPILER2_OR_JVMCI
265 if (UseAPX) {
266 int base_addr = XSAVE_AREA_EGPRS;
267 off = 0;
268 for (int n = 16; n < Register::number_of_registers; n++) {
269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
270 }
271 }
272 #endif
273
274 __ vzeroupper();
275 if (frame::arg_reg_save_area_bytes != 0) {
276 // Allocate argument register save area
277 __ subptr(rsp, frame::arg_reg_save_area_bytes);
278 }
279
280 // Set an oopmap for the call site. This oopmap will map all
281 // oop-registers and debug-info registers as callee-saved. This
282 // will allow deoptimization at this safepoint to find all possible
283 // debug-info recordings, as well as let GC find all oops.
284
285 OopMapSet *oop_maps = new OopMapSet();
286 OopMap* map = new OopMap(frame_size_in_slots, 0);
287
288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
289
290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
294 // rbp location is known implicitly by the frame sender code, needs no oopmap
295 // and the location where rbp was saved by is ignored
296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
306
307 if (UseAPX) {
308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
324 }
325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
326 // on EVEX enabled targets, we get it included in the xsave area
327 off = xmm0_off;
328 int delta = xmm1_off - off;
329 for (int n = 0; n < 16; n++) {
330 XMMRegister xmm_name = as_XMMRegister(n);
331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
332 off += delta;
333 }
334 if (UseAVX > 2) {
335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
336 off = zmm16_off;
337 delta = zmm17_off - off;
338 for (int n = 16; n < num_xmm_regs; n++) {
339 XMMRegister zmm_name = as_XMMRegister(n);
340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
341 off += delta;
342 }
343 }
344
345 #if COMPILER2_OR_JVMCI
346 if (save_wide_vectors) {
347 // Save upper half of YMM registers(0..15)
348 off = ymm0_off;
349 delta = ymm1_off - ymm0_off;
350 for (int n = 0; n < 16; n++) {
351 XMMRegister ymm_name = as_XMMRegister(n);
352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
353 off += delta;
354 }
355 if (VM_Version::supports_evex()) {
356 // Save upper half of ZMM registers(0..15)
357 off = zmm0_off;
358 delta = zmm1_off - zmm0_off;
359 for (int n = 0; n < 16; n++) {
360 XMMRegister zmm_name = as_XMMRegister(n);
361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
362 off += delta;
363 }
364 }
365 }
366 #endif // COMPILER2_OR_JVMCI
367
368 // %%% These should all be a waste but we'll keep things as they were for now
369 if (true) {
370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
374 // rbp location is known implicitly by the frame sender code, needs no oopmap
375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
385 if (UseAPX) {
386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
402 }
403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
404 // on EVEX enabled targets, we get it included in the xsave area
405 off = xmm0H_off;
406 delta = xmm1H_off - off;
407 for (int n = 0; n < 16; n++) {
408 XMMRegister xmm_name = as_XMMRegister(n);
409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
410 off += delta;
411 }
412 if (UseAVX > 2) {
413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
414 off = zmm16H_off;
415 delta = zmm17H_off - off;
416 for (int n = 16; n < num_xmm_regs; n++) {
417 XMMRegister zmm_name = as_XMMRegister(n);
418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
419 off += delta;
420 }
421 }
422 }
423
424 return map;
425 }
426
427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
428 int num_xmm_regs = XMMRegister::available_xmm_registers();
429 if (frame::arg_reg_save_area_bytes != 0) {
430 // Pop arg register save area
431 __ addptr(rsp, frame::arg_reg_save_area_bytes);
432 }
433
434 #if COMPILER2_OR_JVMCI
435 if (restore_wide_vectors) {
436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
438 }
439 #else
440 assert(!restore_wide_vectors, "vectors are generated only by C2");
441 #endif
442
443 __ vzeroupper();
444
445 // On EVEX enabled targets everything is handled in pop fpu state
446 if (restore_wide_vectors) {
447 // Restore upper half of YMM registers (0..15)
448 int base_addr = XSAVE_AREA_YMM_BEGIN;
449 for (int n = 0; n < 16; n++) {
450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
451 }
452 if (VM_Version::supports_evex()) {
453 // Restore upper half of ZMM registers (0..15)
454 base_addr = XSAVE_AREA_ZMM_BEGIN;
455 for (int n = 0; n < 16; n++) {
456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
457 }
458 // Restore full ZMM registers(16..num_xmm_regs)
459 base_addr = XSAVE_AREA_UPPERBANK;
460 int vector_len = Assembler::AVX_512bit;
461 int off = 0;
462 for (int n = 16; n < num_xmm_regs; n++) {
463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
464 }
465 #if COMPILER2_OR_JVMCI
466 base_addr = XSAVE_AREA_OPMASK_BEGIN;
467 off = 0;
468 for (int n = 0; n < KRegister::number_of_registers; n++) {
469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
470 }
471 #endif
472 }
473 } else {
474 if (VM_Version::supports_evex()) {
475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
476 int base_addr = XSAVE_AREA_UPPERBANK;
477 int off = 0;
478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
479 for (int n = 16; n < num_xmm_regs; n++) {
480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
481 }
482 #if COMPILER2_OR_JVMCI
483 base_addr = XSAVE_AREA_OPMASK_BEGIN;
484 off = 0;
485 for (int n = 0; n < KRegister::number_of_registers; n++) {
486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
487 }
488 #endif
489 }
490 }
491
492 #if COMPILER2_OR_JVMCI
493 if (UseAPX) {
494 int base_addr = XSAVE_AREA_EGPRS;
495 int off = 0;
496 for (int n = 16; n < Register::number_of_registers; n++) {
497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
498 }
499 }
500 #endif
501
502 // Recover CPU state
503 __ pop_FPU_state();
504 __ restore_legacy_gprs();
505 __ addq(rsp, 8);
506 __ popf();
507 // Get the rbp described implicitly by the calling convention (no oopMap)
508 __ pop(rbp);
509 }
510
511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
512
513 // Just restore result register. Only used by deoptimization. By
514 // now any callee save register that needs to be restored to a c2
515 // caller of the deoptee has been extracted into the vframeArray
516 // and will be stuffed into the c2i adapter we create for later
517 // restoration so only result registers need to be restored here.
518
519 // Restore fp result register
520 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
521 // Restore integer result register
522 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
523 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
524
525 // Pop all of the register save are off the stack except the return address
526 __ addptr(rsp, return_offset_in_bytes());
527 }
528
529 // Is vector's size (in bytes) bigger than a size saved by default?
530 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
531 bool SharedRuntime::is_wide_vector(int size) {
532 return size > 16;
533 }
534
535 // ---------------------------------------------------------------------------
536 // Read the array of BasicTypes from a signature, and compute where the
537 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
538 // quantities. Values less than VMRegImpl::stack0 are registers, those above
539 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
540 // as framesizes are fixed.
541 // VMRegImpl::stack0 refers to the first slot 0(sp).
542 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.
543 // Register up to Register::number_of_registers are the 64-bit
544 // integer registers.
545
546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
547 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
548 // units regardless of build. Of course for i486 there is no 64 bit build
549
550 // The Java calling convention is a "shifted" version of the C ABI.
551 // By skipping the first C ABI register we can call non-static jni methods
552 // with small numbers of arguments without having to shuffle the arguments
553 // at all. Since we control the java ABI we ought to at least get some
554 // advantage out of it.
555
556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
557 VMRegPair *regs,
558 int total_args_passed) {
559
560 // Create the mapping between argument positions and
561 // registers.
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
564 };
565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
566 j_farg0, j_farg1, j_farg2, j_farg3,
567 j_farg4, j_farg5, j_farg6, j_farg7
568 };
569
570
571 uint int_args = 0;
572 uint fp_args = 0;
573 uint stk_args = 0;
574
575 for (int i = 0; i < total_args_passed; i++) {
576 switch (sig_bt[i]) {
577 case T_BOOLEAN:
578 case T_CHAR:
579 case T_BYTE:
580 case T_SHORT:
581 case T_INT:
582 if (int_args < Argument::n_int_register_parameters_j) {
583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
584 } else {
585 stk_args = align_up(stk_args, 2);
586 regs[i].set1(VMRegImpl::stack2reg(stk_args));
587 stk_args += 1;
588 }
589 break;
590 case T_VOID:
591 // halves of T_LONG or T_DOUBLE
592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
593 regs[i].set_bad();
594 break;
595 case T_LONG:
596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
597 // fall through
598 case T_OBJECT:
599 case T_ARRAY:
600 case T_ADDRESS:
601 if (int_args < Argument::n_int_register_parameters_j) {
602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
603 } else {
604 stk_args = align_up(stk_args, 2);
605 regs[i].set2(VMRegImpl::stack2reg(stk_args));
606 stk_args += 2;
607 }
608 break;
609 case T_FLOAT:
610 if (fp_args < Argument::n_float_register_parameters_j) {
611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
612 } else {
613 stk_args = align_up(stk_args, 2);
614 regs[i].set1(VMRegImpl::stack2reg(stk_args));
615 stk_args += 1;
616 }
617 break;
618 case T_DOUBLE:
619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
620 if (fp_args < Argument::n_float_register_parameters_j) {
621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
622 } else {
623 stk_args = align_up(stk_args, 2);
624 regs[i].set2(VMRegImpl::stack2reg(stk_args));
625 stk_args += 2;
626 }
627 break;
628 default:
629 ShouldNotReachHere();
630 break;
631 }
632 }
633
634 return stk_args;
635 }
636
637 // Patch the callers callsite with entry to compiled code if it exists.
638 static void patch_callers_callsite(MacroAssembler *masm) {
639 Label L;
640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
641 __ jcc(Assembler::equal, L);
642
643 // Save the current stack pointer
644 __ mov(r13, rsp);
645 // Schedule the branch target address early.
646 // Call into the VM to patch the caller, then jump to compiled callee
647 // rax isn't live so capture return address while we easily can
648 __ movptr(rax, Address(rsp, 0));
649
650 // align stack so push_CPU_state doesn't fault
651 __ andptr(rsp, -(StackAlignmentInBytes));
652 __ push_CPU_state();
653 __ vzeroupper();
654 // VM needs caller's callsite
655 // VM needs target method
656 // This needs to be a long call since we will relocate this adapter to
657 // the codeBuffer and it may not reach
658
659 // Allocate argument register save area
660 if (frame::arg_reg_save_area_bytes != 0) {
661 __ subptr(rsp, frame::arg_reg_save_area_bytes);
662 }
663 __ mov(c_rarg0, rbx);
664 __ mov(c_rarg1, rax);
665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
666
667 // De-allocate argument register save area
668 if (frame::arg_reg_save_area_bytes != 0) {
669 __ addptr(rsp, frame::arg_reg_save_area_bytes);
670 }
671
672 __ vzeroupper();
673 __ pop_CPU_state();
674 // restore sp
675 __ mov(rsp, r13);
676 __ bind(L);
677 }
678
679 static void gen_c2i_adapter(MacroAssembler *masm,
680 int total_args_passed,
681 int comp_args_on_stack,
682 const BasicType *sig_bt,
683 const VMRegPair *regs,
684 Label& skip_fixup) {
685 // Before we get into the guts of the C2I adapter, see if we should be here
686 // at all. We've come from compiled code and are attempting to jump to the
687 // interpreter, which means the caller made a static call to get here
688 // (vcalls always get a compiled target if there is one). Check for a
689 // compiled target. If there is one, we need to patch the caller's call.
690 patch_callers_callsite(masm);
691
692 __ bind(skip_fixup);
693
694 // Since all args are passed on the stack, total_args_passed *
695 // Interpreter::stackElementSize is the space we need.
696
697 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
698
699 int extraspace = (total_args_passed * Interpreter::stackElementSize);
700
701 // stack is aligned, keep it that way
702 // This is not currently needed or enforced by the interpreter, but
703 // we might as well conform to the ABI.
704 extraspace = align_up(extraspace, 2*wordSize);
705
706 // set senderSP value
707 __ lea(r13, Address(rsp, wordSize));
708
709 #ifdef ASSERT
710 __ check_stack_alignment(r13, "sender stack not aligned");
711 #endif
712 if (extraspace > 0) {
713 // Pop the return address
714 __ pop(rax);
715
716 __ subptr(rsp, extraspace);
717
718 // Push the return address
719 __ push(rax);
720
721 // Account for the return address location since we store it first rather
722 // than hold it in a register across all the shuffling
723 extraspace += wordSize;
724 }
725
726 #ifdef ASSERT
727 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
728 #endif
729
730 // Now write the args into the outgoing interpreter space
731 for (int i = 0; i < total_args_passed; i++) {
732 if (sig_bt[i] == T_VOID) {
733 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
734 continue;
735 }
736
737 // offset to start parameters
738 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
739 int next_off = st_off - Interpreter::stackElementSize;
740
741 // Say 4 args:
742 // i st_off
743 // 0 32 T_LONG
744 // 1 24 T_VOID
745 // 2 16 T_OBJECT
746 // 3 8 T_BOOL
747 // - 0 return address
748 //
749 // However to make thing extra confusing. Because we can fit a long/double in
750 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
751 // leaves one slot empty and only stores to a single slot. In this case the
752 // slot that is occupied is the T_VOID slot. See I said it was confusing.
753
754 VMReg r_1 = regs[i].first();
755 VMReg r_2 = regs[i].second();
756 if (!r_1->is_valid()) {
757 assert(!r_2->is_valid(), "");
758 continue;
759 }
760 if (r_1->is_stack()) {
761 // memory to memory use rax
762 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
763 if (!r_2->is_valid()) {
764 // sign extend??
765 __ movl(rax, Address(rsp, ld_off));
766 __ movptr(Address(rsp, st_off), rax);
767
768 } else {
769
770 __ movq(rax, Address(rsp, ld_off));
771
772 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
773 // T_DOUBLE and T_LONG use two slots in the interpreter
774 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
775 // ld_off == LSW, ld_off+wordSize == MSW
776 // st_off == MSW, next_off == LSW
777 __ movq(Address(rsp, next_off), rax);
778 #ifdef ASSERT
779 // Overwrite the unused slot with known junk
780 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
781 __ movptr(Address(rsp, st_off), rax);
782 #endif /* ASSERT */
783 } else {
784 __ movq(Address(rsp, st_off), rax);
785 }
786 }
787 } else if (r_1->is_Register()) {
788 Register r = r_1->as_Register();
789 if (!r_2->is_valid()) {
790 // must be only an int (or less ) so move only 32bits to slot
791 // why not sign extend??
792 __ movl(Address(rsp, st_off), r);
793 } else {
794 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
795 // T_DOUBLE and T_LONG use two slots in the interpreter
796 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
797 // long/double in gpr
798 #ifdef ASSERT
799 // Overwrite the unused slot with known junk
800 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
801 __ movptr(Address(rsp, st_off), rax);
802 #endif /* ASSERT */
803 __ movq(Address(rsp, next_off), r);
804 } else {
805 __ movptr(Address(rsp, st_off), r);
806 }
807 }
808 } else {
809 assert(r_1->is_XMMRegister(), "");
810 if (!r_2->is_valid()) {
811 // only a float use just part of the slot
812 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
813 } else {
814 #ifdef ASSERT
815 // Overwrite the unused slot with known junk
816 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
817 __ movptr(Address(rsp, st_off), rax);
818 #endif /* ASSERT */
819 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
820 }
821 }
822 }
823
824 // Schedule the branch target address early.
825 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
826 __ jmp(rcx);
827 }
828
829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
830 int total_args_passed,
831 int comp_args_on_stack,
832 const BasicType *sig_bt,
833 const VMRegPair *regs) {
834
835 // Note: r13 contains the senderSP on entry. We must preserve it since
836 // we may do a i2c -> c2i transition if we lose a race where compiled
837 // code goes non-entrant while we get args ready.
838 // In addition we use r13 to locate all the interpreter args as
839 // we must align the stack to 16 bytes on an i2c entry else we
840 // lose alignment we expect in all compiled code and register
841 // save code can segv when fxsave instructions find improperly
842 // aligned stack pointer.
843
844 // Adapters can be frameless because they do not require the caller
845 // to perform additional cleanup work, such as correcting the stack pointer.
846 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
847 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
848 // even if a callee has modified the stack pointer.
849 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
850 // routinely repairs its caller's stack pointer (from sender_sp, which is set
851 // up via the senderSP register).
852 // In other words, if *either* the caller or callee is interpreted, we can
853 // get the stack pointer repaired after a call.
854 // This is why c2i and i2c adapters cannot be indefinitely composed.
855 // In particular, if a c2i adapter were to somehow call an i2c adapter,
856 // both caller and callee would be compiled methods, and neither would
857 // clean up the stack pointer changes performed by the two adapters.
858 // If this happens, control eventually transfers back to the compiled
859 // caller, but with an uncorrected stack, causing delayed havoc.
860
861 // Must preserve original SP for loading incoming arguments because
862 // we need to align the outgoing SP for compiled code.
863 __ movptr(r11, rsp);
864
865 // Pick up the return address
866 __ pop(rax);
867
868 // Convert 4-byte c2 stack slots to words.
869 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
870
871 if (comp_args_on_stack) {
872 __ subptr(rsp, comp_words_on_stack * wordSize);
873 }
874
875 // Ensure compiled code always sees stack at proper alignment
876 __ andptr(rsp, -16);
877
878 // push the return address and misalign the stack that youngest frame always sees
879 // as far as the placement of the call instruction
880 __ push(rax);
881
882 // Put saved SP in another register
883 const Register saved_sp = rax;
884 __ movptr(saved_sp, r11);
885
886 // Will jump to the compiled code just as if compiled code was doing it.
887 // Pre-load the register-jump target early, to schedule it better.
888 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
889
890 #if INCLUDE_JVMCI
891 if (EnableJVMCI) {
892 // check if this call should be routed towards a specific entry point
893 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
894 Label no_alternative_target;
895 __ jcc(Assembler::equal, no_alternative_target);
896 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
897 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
898 __ bind(no_alternative_target);
899 }
900 #endif // INCLUDE_JVMCI
901
902 // Now generate the shuffle code. Pick up all register args and move the
903 // rest through the floating point stack top.
904 for (int i = 0; i < total_args_passed; i++) {
905 if (sig_bt[i] == T_VOID) {
906 // Longs and doubles are passed in native word order, but misaligned
907 // in the 32-bit build.
908 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
909 continue;
910 }
911
912 // Pick up 0, 1 or 2 words from SP+offset.
913
914 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
915 "scrambled load targets?");
916 // Load in argument order going down.
917 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
918 // Point to interpreter value (vs. tag)
919 int next_off = ld_off - Interpreter::stackElementSize;
920 //
921 //
922 //
923 VMReg r_1 = regs[i].first();
924 VMReg r_2 = regs[i].second();
925 if (!r_1->is_valid()) {
926 assert(!r_2->is_valid(), "");
927 continue;
928 }
929 if (r_1->is_stack()) {
930 // Convert stack slot to an SP offset (+ wordSize to account for return address )
931 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
932
933 // We can use r13 as a temp here because compiled code doesn't need r13 as an input
934 // and if we end up going thru a c2i because of a miss a reasonable value of r13
935 // will be generated.
936 if (!r_2->is_valid()) {
937 // sign extend???
938 __ movl(r13, Address(saved_sp, ld_off));
939 __ movptr(Address(rsp, st_off), r13);
940 } else {
941 //
942 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
943 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
944 // So we must adjust where to pick up the data to match the interpreter.
945 //
946 // Interpreter local[n] == MSW, local[n+1] == LSW however locals
947 // are accessed as negative so LSW is at LOW address
948
949 // ld_off is MSW so get LSW
950 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
951 next_off : ld_off;
952 __ movq(r13, Address(saved_sp, offset));
953 // st_off is LSW (i.e. reg.first())
954 __ movq(Address(rsp, st_off), r13);
955 }
956 } else if (r_1->is_Register()) { // Register argument
957 Register r = r_1->as_Register();
958 assert(r != rax, "must be different");
959 if (r_2->is_valid()) {
960 //
961 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
962 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
963 // So we must adjust where to pick up the data to match the interpreter.
964
965 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
966 next_off : ld_off;
967
968 // this can be a misaligned move
969 __ movq(r, Address(saved_sp, offset));
970 } else {
971 // sign extend and use a full word?
972 __ movl(r, Address(saved_sp, ld_off));
973 }
974 } else {
975 if (!r_2->is_valid()) {
976 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
977 } else {
978 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
979 }
980 }
981 }
982
983 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
984
985 // 6243940 We might end up in handle_wrong_method if
986 // the callee is deoptimized as we race thru here. If that
987 // happens we don't want to take a safepoint because the
988 // caller frame will look interpreted and arguments are now
989 // "compiled" so it is much better to make this transition
990 // invisible to the stack walking code. Unfortunately if
991 // we try and find the callee by normal means a safepoint
992 // is possible. So we stash the desired callee in the thread
993 // and the vm will find there should this case occur.
994
995 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
996
997 // put Method* where a c2i would expect should we end up there
998 // only needed because eof c2 resolve stubs return Method* as a result in
999 // rax
1000 __ mov(rax, rbx);
1001 __ jmp(r11);
1002 }
1003
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006 int total_args_passed,
1007 int comp_args_on_stack,
1008 const BasicType *sig_bt,
1009 const VMRegPair *regs,
1010 AdapterHandlerEntry* handler) {
1011 address i2c_entry = __ pc();
1012
1013 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014
1015 // -------------------------------------------------------------------------
1016 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
1017 // to the interpreter. The args start out packed in the compiled layout. They
1018 // need to be unpacked into the interpreter layout. This will almost always
1019 // require some stack space. We grow the current (compiled) stack, then repack
1020 // the args. We finally end in a jump to the generic interpreter entry point.
1021 // On exit from the interpreter, the interpreter will restore our SP (lest the
1022 // compiled code, which relies solely on SP and not RBP, get sick).
1023
1024 address c2i_unverified_entry = __ pc();
1025 Label skip_fixup;
1026
1027 Register data = rax;
1028 Register receiver = j_rarg0;
1029 Register temp = rbx;
1030
1031 {
1032 __ ic_check(1 /* end_alignment */);
1033 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1034 // Method might have been compiled since the call site was patched to
1035 // interpreted if that is the case treat it as a miss so we can get
1036 // the call site corrected.
1037 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038 __ jcc(Assembler::equal, skip_fixup);
1039 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040 }
1041
1042 address c2i_entry = __ pc();
1043
1044 // Class initialization barrier for static methods
1045 address c2i_no_clinit_check_entry = nullptr;
1046 if (VM_Version::supports_fast_class_init_checks()) {
1047 Label L_skip_barrier;
1048 Register method = rbx;
1049
1050 { // Bypass the barrier for non-static methods
1051 Register flags = rscratch1;
1052 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053 __ testl(flags, JVM_ACC_STATIC);
1054 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055 }
1056
1057 Register klass = rscratch1;
1058 __ load_method_holder(klass, method);
1059 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1060
1061 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1062
1063 __ bind(L_skip_barrier);
1064 c2i_no_clinit_check_entry = __ pc();
1065 }
1066
1067 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1068 bs->c2i_entry_barrier(masm);
1069
1070 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1071
1072 handler->set_entry_points(i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1073 return;
1074 }
1075
1076 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1077 VMRegPair *regs,
1078 int total_args_passed) {
1079
1080 // We return the amount of VMRegImpl stack slots we need to reserve for all
1081 // the arguments NOT counting out_preserve_stack_slots.
1082
1083 // NOTE: These arrays will have to change when c1 is ported
1084 #ifdef _WIN64
1085 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1086 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1087 };
1088 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1089 c_farg0, c_farg1, c_farg2, c_farg3
1090 };
1091 #else
1092 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1093 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1094 };
1095 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1096 c_farg0, c_farg1, c_farg2, c_farg3,
1097 c_farg4, c_farg5, c_farg6, c_farg7
1098 };
1099 #endif // _WIN64
1100
1101
1102 uint int_args = 0;
1103 uint fp_args = 0;
1104 uint stk_args = 0; // inc by 2 each time
1105
1106 for (int i = 0; i < total_args_passed; i++) {
1107 switch (sig_bt[i]) {
1108 case T_BOOLEAN:
1109 case T_CHAR:
1110 case T_BYTE:
1111 case T_SHORT:
1112 case T_INT:
1113 if (int_args < Argument::n_int_register_parameters_c) {
1114 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116 fp_args++;
1117 // Allocate slots for callee to stuff register args the stack.
1118 stk_args += 2;
1119 #endif
1120 } else {
1121 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1122 stk_args += 2;
1123 }
1124 break;
1125 case T_LONG:
1126 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1127 // fall through
1128 case T_OBJECT:
1129 case T_ARRAY:
1130 case T_ADDRESS:
1131 case T_METADATA:
1132 if (int_args < Argument::n_int_register_parameters_c) {
1133 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1134 #ifdef _WIN64
1135 fp_args++;
1136 stk_args += 2;
1137 #endif
1138 } else {
1139 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140 stk_args += 2;
1141 }
1142 break;
1143 case T_FLOAT:
1144 if (fp_args < Argument::n_float_register_parameters_c) {
1145 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1146 #ifdef _WIN64
1147 int_args++;
1148 // Allocate slots for callee to stuff register args the stack.
1149 stk_args += 2;
1150 #endif
1151 } else {
1152 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1153 stk_args += 2;
1154 }
1155 break;
1156 case T_DOUBLE:
1157 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1158 if (fp_args < Argument::n_float_register_parameters_c) {
1159 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1160 #ifdef _WIN64
1161 int_args++;
1162 // Allocate slots for callee to stuff register args the stack.
1163 stk_args += 2;
1164 #endif
1165 } else {
1166 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1167 stk_args += 2;
1168 }
1169 break;
1170 case T_VOID: // Halves of longs and doubles
1171 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1172 regs[i].set_bad();
1173 break;
1174 default:
1175 ShouldNotReachHere();
1176 break;
1177 }
1178 }
1179 #ifdef _WIN64
1180 // windows abi requires that we always allocate enough stack space
1181 // for 4 64bit registers to be stored down.
1182 if (stk_args < 8) {
1183 stk_args = 8;
1184 }
1185 #endif // _WIN64
1186
1187 return stk_args;
1188 }
1189
1190 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1191 uint num_bits,
1192 uint total_args_passed) {
1193 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1194 "only certain vector sizes are supported for now");
1195
1196 static const XMMRegister VEC_ArgReg[32] = {
1197 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1198 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1199 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1200 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1201 };
1202
1203 uint stk_args = 0;
1204 uint fp_args = 0;
1205
1206 for (uint i = 0; i < total_args_passed; i++) {
1207 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1208 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1209 regs[i].set_pair(vmreg->next(next_val), vmreg);
1210 }
1211
1212 return stk_args;
1213 }
1214
1215 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1216 // We always ignore the frame_slots arg and just use the space just below frame pointer
1217 // which by this time is free to use
1218 switch (ret_type) {
1219 case T_FLOAT:
1220 __ movflt(Address(rbp, -wordSize), xmm0);
1221 break;
1222 case T_DOUBLE:
1223 __ movdbl(Address(rbp, -wordSize), xmm0);
1224 break;
1225 case T_VOID: break;
1226 default: {
1227 __ movptr(Address(rbp, -wordSize), rax);
1228 }
1229 }
1230 }
1231
1232 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1233 // We always ignore the frame_slots arg and just use the space just below frame pointer
1234 // which by this time is free to use
1235 switch (ret_type) {
1236 case T_FLOAT:
1237 __ movflt(xmm0, Address(rbp, -wordSize));
1238 break;
1239 case T_DOUBLE:
1240 __ movdbl(xmm0, Address(rbp, -wordSize));
1241 break;
1242 case T_VOID: break;
1243 default: {
1244 __ movptr(rax, Address(rbp, -wordSize));
1245 }
1246 }
1247 }
1248
1249 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1250 for ( int i = first_arg ; i < arg_count ; i++ ) {
1251 if (args[i].first()->is_Register()) {
1252 __ push(args[i].first()->as_Register());
1253 } else if (args[i].first()->is_XMMRegister()) {
1254 __ subptr(rsp, 2*wordSize);
1255 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1256 }
1257 }
1258 }
1259
1260 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1261 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1262 if (args[i].first()->is_Register()) {
1263 __ pop(args[i].first()->as_Register());
1264 } else if (args[i].first()->is_XMMRegister()) {
1265 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1266 __ addptr(rsp, 2*wordSize);
1267 }
1268 }
1269 }
1270
1271 static void verify_oop_args(MacroAssembler* masm,
1272 const methodHandle& method,
1273 const BasicType* sig_bt,
1274 const VMRegPair* regs) {
1275 Register temp_reg = rbx; // not part of any compiled calling seq
1276 if (VerifyOops) {
1277 for (int i = 0; i < method->size_of_parameters(); i++) {
1278 if (is_reference_type(sig_bt[i])) {
1279 VMReg r = regs[i].first();
1280 assert(r->is_valid(), "bad oop arg");
1281 if (r->is_stack()) {
1282 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1283 __ verify_oop(temp_reg);
1284 } else {
1285 __ verify_oop(r->as_Register());
1286 }
1287 }
1288 }
1289 }
1290 }
1291
1292 static void check_continuation_enter_argument(VMReg actual_vmreg,
1293 Register expected_reg,
1294 const char* name) {
1295 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1296 assert(actual_vmreg->as_Register() == expected_reg,
1297 "%s is in unexpected register: %s instead of %s",
1298 name, actual_vmreg->as_Register()->name(), expected_reg->name());
1299 }
1300
1301
1302 //---------------------------- continuation_enter_setup ---------------------------
1303 //
1304 // Arguments:
1305 // None.
1306 //
1307 // Results:
1308 // rsp: pointer to blank ContinuationEntry
1309 //
1310 // Kills:
1311 // rax
1312 //
1313 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1314 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1315 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
1316 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1317
1318 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1319 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1320
1321 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1322 OopMap* map = new OopMap(frame_size, 0);
1323
1324 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1325 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1326 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1327
1328 return map;
1329 }
1330
1331 //---------------------------- fill_continuation_entry ---------------------------
1332 //
1333 // Arguments:
1334 // rsp: pointer to blank Continuation entry
1335 // reg_cont_obj: pointer to the continuation
1336 // reg_flags: flags
1337 //
1338 // Results:
1339 // rsp: pointer to filled out ContinuationEntry
1340 //
1341 // Kills:
1342 // rax
1343 //
1344 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1345 assert_different_registers(rax, reg_cont_obj, reg_flags);
1346 #ifdef ASSERT
1347 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1348 #endif
1349 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1350 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1351 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1352 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1353 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1354
1355 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1356 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1357 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1358 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1359
1360 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1361 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1362 }
1363
1364 //---------------------------- continuation_enter_cleanup ---------------------------
1365 //
1366 // Arguments:
1367 // rsp: pointer to the ContinuationEntry
1368 //
1369 // Results:
1370 // rsp: pointer to the spilled rbp in the entry frame
1371 //
1372 // Kills:
1373 // rbx
1374 //
1375 static void continuation_enter_cleanup(MacroAssembler* masm) {
1376 #ifdef ASSERT
1377 Label L_good_sp;
1378 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1379 __ jcc(Assembler::equal, L_good_sp);
1380 __ stop("Incorrect rsp at continuation_enter_cleanup");
1381 __ bind(L_good_sp);
1382 #endif
1383 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1384 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1385
1386 if (CheckJNICalls) {
1387 // Check if this is a virtual thread continuation
1388 Label L_skip_vthread_code;
1389 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1390 __ jcc(Assembler::equal, L_skip_vthread_code);
1391
1392 // If the held monitor count is > 0 and this vthread is terminating then
1393 // it failed to release a JNI monitor. So we issue the same log message
1394 // that JavaThread::exit does.
1395 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1396 __ jcc(Assembler::equal, L_skip_vthread_code);
1397
1398 // rax may hold an exception oop, save it before the call
1399 __ push(rax);
1400 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1401 __ pop(rax);
1402
1403 // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1404 // on termination. The held count is implicitly zeroed below when we restore from
1405 // the parent held count (which has to be zero).
1406 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1407
1408 __ bind(L_skip_vthread_code);
1409 }
1410 #ifdef ASSERT
1411 else {
1412 // Check if this is a virtual thread continuation
1413 Label L_skip_vthread_code;
1414 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1415 __ jcc(Assembler::equal, L_skip_vthread_code);
1416
1417 // See comment just above. If not checking JNI calls the JNI count is only
1418 // needed for assertion checking.
1419 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1420
1421 __ bind(L_skip_vthread_code);
1422 }
1423 #endif
1424
1425 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1426 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1427
1428 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1429 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1430 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1431 }
1432
1433 static void gen_continuation_enter(MacroAssembler* masm,
1434 const VMRegPair* regs,
1435 int& exception_offset,
1436 OopMapSet* oop_maps,
1437 int& frame_complete,
1438 int& stack_slots,
1439 int& interpreted_entry_offset,
1440 int& compiled_entry_offset) {
1441
1442 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1443 int pos_cont_obj = 0;
1444 int pos_is_cont = 1;
1445 int pos_is_virtual = 2;
1446
1447 // The platform-specific calling convention may present the arguments in various registers.
1448 // To simplify the rest of the code, we expect the arguments to reside at these known
1449 // registers, and we additionally check the placement here in case calling convention ever
1450 // changes.
1451 Register reg_cont_obj = c_rarg1;
1452 Register reg_is_cont = c_rarg2;
1453 Register reg_is_virtual = c_rarg3;
1454
1455 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1456 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1457 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1458
1459 // Utility methods kill rax, make sure there are no collisions
1460 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1461
1462 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1463 relocInfo::static_call_type);
1464
1465 address start = __ pc();
1466
1467 Label L_thaw, L_exit;
1468
1469   // i2i entry, used only in interp_only_mode
1470 interpreted_entry_offset = __ pc() - start;
1471 {
1472 #ifdef ASSERT
1473 Label is_interp_only;
1474 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1475 __ jcc(Assembler::notEqual, is_interp_only);
1476 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1477 __ bind(is_interp_only);
1478 #endif
1479
1480 __ pop(rax); // return address
1481 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1482 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1483 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1484 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1485 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1486 __ push(rax); // return address
1487 __ push_cont_fastpath();
1488
1489 __ enter();
1490
1491 stack_slots = 2; // will be adjusted in setup
1492 OopMap* map = continuation_enter_setup(masm, stack_slots);
1493     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1494     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1495
1496 __ verify_oop(reg_cont_obj);
1497
1498 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1499
1500 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1501 __ testptr(reg_is_cont, reg_is_cont);
1502 __ jcc(Assembler::notZero, L_thaw);
1503
1504 // --- Resolve path
1505
1506 // Make sure the call is patchable
1507 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1508 // Emit stub for static call
1509 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1510 if (stub == nullptr) {
1511 fatal("CodeCache is full at gen_continuation_enter");
1512 }
1513 __ call(resolve);
1514 oop_maps->add_gc_map(__ pc() - start, map);
1515 __ post_call_nop();
1516
1517 __ jmp(L_exit);
1518 }
1519
1520 // compiled entry
1521 __ align(CodeEntryAlignment);
1522 compiled_entry_offset = __ pc() - start;
1523 __ enter();
1524
1525 stack_slots = 2; // will be adjusted in setup
1526 OopMap* map = continuation_enter_setup(masm, stack_slots);
1527
1528 // Frame is now completed as far as size and linkage.
1529 frame_complete = __ pc() - start;
1530
1531 __ verify_oop(reg_cont_obj);
1532
1533 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1534
1535 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1536 __ testptr(reg_is_cont, reg_is_cont);
1537 __ jccb(Assembler::notZero, L_thaw);
1538
1539 // --- call Continuation.enter(Continuation c, boolean isContinue)
1540
1541 // Make sure the call is patchable
1542 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1543
1544 // Emit stub for static call
1545 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1546 if (stub == nullptr) {
1547 fatal("CodeCache is full at gen_continuation_enter");
1548 }
1549
1550 // The call needs to be resolved. There's a special case for this in
1551 // SharedRuntime::find_callee_info_helper() which calls
1552 // LinkResolver::resolve_continuation_enter() which resolves the call to
1553 // Continuation.enter(Continuation c, boolean isContinue).
1554 __ call(resolve);
1555
1556 oop_maps->add_gc_map(__ pc() - start, map);
1557 __ post_call_nop();
1558
1559 __ jmpb(L_exit);
1560
1561 // --- Thawing path
1562
1563 __ bind(L_thaw);
1564
1565 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1566 __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1567
1568 ContinuationEntry::_return_pc_offset = __ pc() - start;
1569 oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1570 __ post_call_nop();
1571
1572 // --- Normal exit (resolve/thawing)
1573
1574 __ bind(L_exit);
1575 ContinuationEntry::_cleanup_offset = __ pc() - start;
1576 continuation_enter_cleanup(masm);
1577 __ pop(rbp);
1578 __ ret(0);
1579
1580 // --- Exception handling path
1581
1582 exception_offset = __ pc() - start;
1583
1584 continuation_enter_cleanup(masm);
1585 __ pop(rbp);
1586
1587 __ movptr(c_rarg0, r15_thread);
1588 __ movptr(c_rarg1, Address(rsp, 0)); // return address
1589
1590 // rax still holds the original exception oop, save it before the call
1591 __ push(rax);
1592
1593 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1594 __ movptr(rbx, rax);
1595
1596 // Continue at exception handler:
1597 // rax: exception oop
1598 // rbx: exception handler
1599 // rdx: exception pc
1600 __ pop(rax);
1601 __ verify_oop(rax);
1602 __ pop(rdx);
1603 __ jmp(rbx);
1604 }
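// Illustrative only: a rough control-flow sketch of the stub generated above
// (a sketch of the emitted logic, not code that is compiled or executed here):
//
//   enterSpecial(c, isContinue, isVirtualThread):
//     push a ContinuationEntry frame and fill it        // continuation_enter_setup / fill_continuation_entry
//     if (isContinue) goto thaw;                         // resume an existing continuation
//     call Continuation.enter(c, false);                 // resolved via LinkResolver::resolve_continuation_enter
//     goto exit;
//   thaw:
//     call StubRoutines::cont_thaw();
//   exit:
//     continuation_enter_cleanup(); pop rbp; return;
//   exception:
//     continuation_enter_cleanup(); pop rbp;
//     jump to exception_handler_for_return_address(thread, return_pc);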
1605
1606 static void gen_continuation_yield(MacroAssembler* masm,
1607 const VMRegPair* regs,
1608 OopMapSet* oop_maps,
1609 int& frame_complete,
1610 int& stack_slots,
1611 int& compiled_entry_offset) {
1612 enum layout {
1613 rbp_off,
1614 rbpH_off,
1615 return_off,
1616 return_off2,
1617 framesize // inclusive of return address
1618 };
1619 stack_slots = framesize / VMRegImpl::slots_per_word;
1620 assert(stack_slots == 2, "recheck layout");
1621
1622 address start = __ pc();
1623 compiled_entry_offset = __ pc() - start;
1624 __ enter();
1625 address the_pc = __ pc();
1626
1627 frame_complete = the_pc - start;
1628
1629   // This nop must be exactly at the PC we push into the frame info.
1630   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1631   // with it right away.
1632 __ post_call_nop();
1633 OopMap* map = new OopMap(framesize, 1);
1634 oop_maps->add_gc_map(frame_complete, map);
1635
1636 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1637 __ movptr(c_rarg0, r15_thread);
1638 __ movptr(c_rarg1, rsp);
1639 __ call_VM_leaf(Continuation::freeze_entry(), 2);
1640 __ reset_last_Java_frame(true);
1641
1642 Label L_pinned;
1643
1644 __ testptr(rax, rax);
1645 __ jcc(Assembler::notZero, L_pinned);
1646
1647 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1648 continuation_enter_cleanup(masm);
1649 __ pop(rbp);
1650 __ ret(0);
1651
1652 __ bind(L_pinned);
1653
1654 // Pinned, return to caller
1655
1656 // handle pending exception thrown by freeze
1657 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1658 Label ok;
1659 __ jcc(Assembler::equal, ok);
1660 __ leave();
1661 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1662 __ bind(ok);
1663
1664 __ leave();
1665 __ ret(0);
1666 }
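// Illustrative only: the yield stub above behaves roughly like this sketch
// (not generated code):
//
//   doYield():
//     set_last_Java_frame();
//     res = Continuation::freeze_entry()(thread, sp);
//     if (res == 0) {                                    // freeze succeeded
//       rsp = thread->cont_entry;                        // continue in enterSpecial's frame
//       continuation_enter_cleanup(); pop rbp; return;
//     }
//     // pinned: return the failure code to the caller,
//     // forwarding a pending exception if freeze raised one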
1667
1668 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1669 ::continuation_enter_cleanup(masm);
1670 }
1671
1672 static void gen_special_dispatch(MacroAssembler* masm,
1673 const methodHandle& method,
1674 const BasicType* sig_bt,
1675 const VMRegPair* regs) {
1676 verify_oop_args(masm, method, sig_bt, regs);
1677 vmIntrinsics::ID iid = method->intrinsic_id();
1678
1679 // Now write the args into the outgoing interpreter space
1680 bool has_receiver = false;
1681 Register receiver_reg = noreg;
1682 int member_arg_pos = -1;
1683 Register member_reg = noreg;
1684 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1685 if (ref_kind != 0) {
1686 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1687 member_reg = rbx; // known to be free at this point
1688 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1689 } else if (iid == vmIntrinsics::_invokeBasic) {
1690 has_receiver = true;
1691 } else if (iid == vmIntrinsics::_linkToNative) {
1692 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
1693 member_reg = rbx; // known to be free at this point
1694 } else {
1695 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1696 }
1697
1698 if (member_reg != noreg) {
1699 // Load the member_arg into register, if necessary.
1700 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1701 VMReg r = regs[member_arg_pos].first();
1702 if (r->is_stack()) {
1703 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1704 } else {
1705 // no data motion is needed
1706 member_reg = r->as_Register();
1707 }
1708 }
1709
1710 if (has_receiver) {
1711 // Make sure the receiver is loaded into a register.
1712 assert(method->size_of_parameters() > 0, "oob");
1713 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1714 VMReg r = regs[0].first();
1715 assert(r->is_valid(), "bad receiver arg");
1716 if (r->is_stack()) {
1717 // Porting note: This assumes that compiled calling conventions always
1718 // pass the receiver oop in a register. If this is not true on some
1719 // platform, pick a temp and load the receiver from stack.
1720 fatal("receiver always in a register");
1721 receiver_reg = j_rarg0; // known to be free at this point
1722 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1723 } else {
1724 // no data motion is needed
1725 receiver_reg = r->as_Register();
1726 }
1727 }
1728
1729 // Figure out which address we are really jumping to:
1730 MethodHandles::generate_method_handle_dispatch(masm, iid,
1731 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1732 }
1733
1734 // ---------------------------------------------------------------------------
1735 // Generate a native wrapper for a given method. The method takes arguments
1736 // in the Java compiled code convention, marshals them to the native
1737 // convention (handlizes oops, etc), transitions to native, makes the call,
1738 // returns to java state (possibly blocking), unhandlizes any result and
1739 // returns.
1740 //
1741 // Critical native functions are a shorthand for the use of
1742 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1743 // functions. The wrapper is expected to unpack the arguments before
1744 // passing them to the callee. Critical native functions leave the state _in_Java,
1745 // since they cannot stop for GC.
1746 // Some other parts of JNI setup are skipped, like the teardown of the JNI handle
1747 // block and the check for pending exceptions, since it's impossible for them
1748 // to be thrown.
1749 //
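// Illustrative only: the wrapper generated below has roughly this shape
// (a sketch of the emitted logic, not compiled code):
//
//   wrapper(java args...):
//     ic_check(); clinit barrier if needed; stack bang; enter();
//     shuffle Java args -> C args (handlize oops, add JNIEnv* and, if static, jclass);
//     if (synchronized) lock(obj);
//     thread->state = _thread_in_native;
//     result = native_func(JNIEnv*, ...);
//     thread->state = _thread_in_native_trans; fence();
//     if (safepoint or suspend pending) check_special_condition_for_native_trans();
//     thread->state = _thread_in_Java;
//     if (synchronized) unlock(obj);
//     unhandlize oop result; reset handle block; return;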
1750 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1751 const methodHandle& method,
1752 int compile_id,
1753 BasicType* in_sig_bt,
1754 VMRegPair* in_regs,
1755 BasicType ret_type) {
1756 if (method->is_continuation_native_intrinsic()) {
1757 int exception_offset = -1;
1758 OopMapSet* oop_maps = new OopMapSet();
1759 int frame_complete = -1;
1760 int stack_slots = -1;
1761 int interpreted_entry_offset = -1;
1762 int vep_offset = -1;
1763 if (method->is_continuation_enter_intrinsic()) {
1764 gen_continuation_enter(masm,
1765 in_regs,
1766 exception_offset,
1767 oop_maps,
1768 frame_complete,
1769 stack_slots,
1770 interpreted_entry_offset,
1771 vep_offset);
1772 } else if (method->is_continuation_yield_intrinsic()) {
1773 gen_continuation_yield(masm,
1774 in_regs,
1775 oop_maps,
1776 frame_complete,
1777 stack_slots,
1778 vep_offset);
1779 } else {
1780 guarantee(false, "Unknown Continuation native intrinsic");
1781 }
1782
1783 #ifdef ASSERT
1784 if (method->is_continuation_enter_intrinsic()) {
1785 assert(interpreted_entry_offset != -1, "Must be set");
1786 assert(exception_offset != -1, "Must be set");
1787 } else {
1788 assert(interpreted_entry_offset == -1, "Must be unset");
1789 assert(exception_offset == -1, "Must be unset");
1790 }
1791 assert(frame_complete != -1, "Must be set");
1792 assert(stack_slots != -1, "Must be set");
1793 assert(vep_offset != -1, "Must be set");
1794 #endif
1795
1796 __ flush();
1797 nmethod* nm = nmethod::new_native_nmethod(method,
1798 compile_id,
1799 masm->code(),
1800 vep_offset,
1801 frame_complete,
1802 stack_slots,
1803 in_ByteSize(-1),
1804 in_ByteSize(-1),
1805 oop_maps,
1806 exception_offset);
1807 if (nm == nullptr) return nm;
1808 if (method->is_continuation_enter_intrinsic()) {
1809 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1810 } else if (method->is_continuation_yield_intrinsic()) {
1811 _cont_doYield_stub = nm;
1812 }
1813 return nm;
1814 }
1815
1816 if (method->is_method_handle_intrinsic()) {
1817 vmIntrinsics::ID iid = method->intrinsic_id();
1818 intptr_t start = (intptr_t)__ pc();
1819 int vep_offset = ((intptr_t)__ pc()) - start;
1820 gen_special_dispatch(masm,
1821 method,
1822 in_sig_bt,
1823 in_regs);
1824 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1825 __ flush();
1826 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1827 return nmethod::new_native_nmethod(method,
1828 compile_id,
1829 masm->code(),
1830 vep_offset,
1831 frame_complete,
1832 stack_slots / VMRegImpl::slots_per_word,
1833 in_ByteSize(-1),
1834 in_ByteSize(-1),
1835 nullptr);
1836 }
1837 address native_func = method->native_function();
1838 assert(native_func != nullptr, "must have function");
1839
1840 // An OopMap for lock (and class if static)
1841 OopMapSet *oop_maps = new OopMapSet();
1842 intptr_t start = (intptr_t)__ pc();
1843
1844   // We have received a description of where all the Java args are located
1845   // on entry to the wrapper. We need to convert these args to where
1846   // the JNI function will expect them. To figure out where they go
1847   // we convert the Java signature to a C signature by inserting
1848   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1849
1850 const int total_in_args = method->size_of_parameters();
1851 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1852
1853 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1854 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1855
1856 int argc = 0;
1857 out_sig_bt[argc++] = T_ADDRESS;
1858 if (method->is_static()) {
1859 out_sig_bt[argc++] = T_OBJECT;
1860 }
1861
1862 for (int i = 0; i < total_in_args ; i++ ) {
1863 out_sig_bt[argc++] = in_sig_bt[i];
1864 }
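  // Illustrative only: for a hypothetical static method taking (int, Object),
  // the expanded C signature built above is
  //   out_sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT /* jclass */, T_INT, T_OBJECT }
  // while a non-static method would get only the leading T_ADDRESS inserted.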
1865
1866 // Now figure out where the args must be stored and how much stack space
1867 // they require.
1868 int out_arg_slots;
1869 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1870
1871 // Compute framesize for the wrapper. We need to handlize all oops in
1872 // incoming registers
1873
1874 // Calculate the total number of stack slots we will need.
1875
1876 // First count the abi requirement plus all of the outgoing args
1877 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1878
1879 // Now the space for the inbound oop handle area
1880 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1881
1882 int oop_handle_offset = stack_slots;
1883 stack_slots += total_save_slots;
1884
1885 // Now any space we need for handlizing a klass if static method
1886
1887 int klass_slot_offset = 0;
1888 int klass_offset = -1;
1889 int lock_slot_offset = 0;
1890 bool is_static = false;
1891
1892 if (method->is_static()) {
1893 klass_slot_offset = stack_slots;
1894 stack_slots += VMRegImpl::slots_per_word;
1895 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1896 is_static = true;
1897 }
1898
1899 // Plus a lock if needed
1900
1901 if (method->is_synchronized()) {
1902 lock_slot_offset = stack_slots;
1903 stack_slots += VMRegImpl::slots_per_word;
1904 }
1905
1906   // Now a place (+2) to save return values or temps during shuffling
1907   // + 4 for the return address (which we own) and saved rbp
1908 stack_slots += 6;
1909
1910   // OK, the space we have allocated will look like:
1911 //
1912 //
1913 // FP-> | |
1914 // |---------------------|
1915 // | 2 slots for moves |
1916 // |---------------------|
1917 // | lock box (if sync) |
1918 // |---------------------| <- lock_slot_offset
1919 // | klass (if static) |
1920 // |---------------------| <- klass_slot_offset
1921 // | oopHandle area |
1922 // |---------------------| <- oop_handle_offset (6 java arg registers)
1923 // | outbound memory |
1924 // | based arguments |
1925 // | |
1926 // |---------------------|
1927 // | |
1928 // SP-> | out_preserved_slots |
1929 //
1930 //
1931
1932
1933   // Now compute the actual number of stack words we need, rounding to keep
1934   // the stack properly aligned.
1935 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1936
1937 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
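  // Illustrative only: a worked example of the slot accounting above, assuming a
  // non-static, non-synchronized method whose outgoing C args need 8 slots:
  //   0 (out_preserve) + 8 (out args) + 12 (oop handle area)
  //   + 6 (2 temp slots + return address + saved rbp)       = 26 slots
  //   aligned up to StackAlignmentInSlots (4)                = 28 slots = 112 bytes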
1938
1939   // First thing, make an IC check to see if we should even be here
1940
1941 // We are free to use all registers as temps without saving them and
1942 // restoring them except rbp. rbp is the only callee save register
1943 // as far as the interpreter and the compiler(s) are concerned.
1944
1945 const Register receiver = j_rarg0;
1946
1947 Label exception_pending;
1948
1949 assert_different_registers(receiver, rscratch1, rscratch2);
1950 __ verify_oop(receiver);
1951 __ ic_check(8 /* end_alignment */);
1952
1953 int vep_offset = ((intptr_t)__ pc()) - start;
1954
1955 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1956 Label L_skip_barrier;
1957 Register klass = r10;
1958 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1959 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1960
1961 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1962
1963 __ bind(L_skip_barrier);
1964 }
1965
1966 #ifdef COMPILER1
1967 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1968 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1969 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1970 }
1971 #endif // COMPILER1
1972
1973 // The instruction at the verified entry point must be 5 bytes or longer
1974 // because it can be patched on the fly by make_non_entrant. The stack bang
1975 // instruction fits that requirement.
1976
1977 // Generate stack overflow check
1978 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1979
1980 // Generate a new frame for the wrapper.
1981 __ enter();
1982 // -2 because return address is already present and so is saved rbp
1983 __ subptr(rsp, stack_size - 2*wordSize);
1984
1985 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1986 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1987 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1988
1989 // Frame is now completed as far as size and linkage.
1990 int frame_complete = ((intptr_t)__ pc()) - start;
1991
1992 #ifdef ASSERT
1993 __ check_stack_alignment(rsp, "improperly aligned stack");
1994 #endif /* ASSERT */
1995
1996
1997 // We use r14 as the oop handle for the receiver/klass
1998 // It is callee save so it survives the call to native
1999
2000 const Register oop_handle_reg = r14;
2001
2002 //
2003   // We immediately shuffle the arguments so that for any VM call we have to
2004   // make from here on out (sync slow path, jvmti, etc.) we will have
2005   // captured the oops from our caller and have a valid oopMap for
2006   // them.
2007
2008 // -----------------
2009 // The Grand Shuffle
2010
2011   // The Java calling convention is either equal to (linux) or denser than (win64) the
2012   // C calling convention. However, because of the jni_env argument the C calling
2013   // convention always has at least one more (and two for static) arguments than Java.
2014   // Therefore if we move the args from java -> c backwards then we will never have
2015   // a register->register conflict and we don't have to build a dependency graph
2016   // and figure out how to break any cycles.
2017 //
2018
2019 // Record esp-based slot for receiver on stack for non-static methods
2020 int receiver_offset = -1;
2021
2022   // This is a trick. We double the stack slots so we can claim
2023   // the oops in the caller's frame. Since we are sure to have
2024   // more args than the caller, doubling is enough to make
2025   // sure we can capture all the incoming oop args from the
2026   // caller.
2027 //
2028 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2029
2030 // Mark location of rbp (someday)
2031 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2032
2033 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2034 // All inbound args are referenced based on rbp and all outbound args via rsp.
2035
2036
2037 #ifdef ASSERT
2038 bool reg_destroyed[Register::number_of_registers];
2039 bool freg_destroyed[XMMRegister::number_of_registers];
2040 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2041 reg_destroyed[r] = false;
2042 }
2043 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2044 freg_destroyed[f] = false;
2045 }
2046
2047 #endif /* ASSERT */
2048
2049 // For JNI natives the incoming and outgoing registers are offset upwards.
2050 GrowableArray<int> arg_order(2 * total_in_args);
2051
2052 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2053 arg_order.push(i);
2054 arg_order.push(c_arg);
2055 }
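  // Illustrative only: with 3 incoming Java args and 4 C args (non-static),
  // the loop above produces the pairs (2,3), (1,2), (0,1), i.e. the args are
  // moved from last to first so a source register is never overwritten before
  // it has been read.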
2056
2057 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2058 int i = arg_order.at(ai);
2059 int c_arg = arg_order.at(ai + 1);
2060 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2061 #ifdef ASSERT
2062 if (in_regs[i].first()->is_Register()) {
2063 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2064 } else if (in_regs[i].first()->is_XMMRegister()) {
2065 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2066 }
2067 if (out_regs[c_arg].first()->is_Register()) {
2068 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2069 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2070 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2071 }
2072 #endif /* ASSERT */
2073 switch (in_sig_bt[i]) {
2074 case T_ARRAY:
2075 case T_OBJECT:
2076 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2077 ((i == 0) && (!is_static)),
2078 &receiver_offset);
2079 break;
2080 case T_VOID:
2081 break;
2082
2083 case T_FLOAT:
2084 __ float_move(in_regs[i], out_regs[c_arg]);
2085 break;
2086
2087 case T_DOUBLE:
2088 assert( i + 1 < total_in_args &&
2089 in_sig_bt[i + 1] == T_VOID &&
2090 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2091 __ double_move(in_regs[i], out_regs[c_arg]);
2092 break;
2093
2094 case T_LONG :
2095 __ long_move(in_regs[i], out_regs[c_arg]);
2096 break;
2097
2098 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2099
2100 default:
2101 __ move32_64(in_regs[i], out_regs[c_arg]);
2102 }
2103 }
2104
2105 int c_arg;
2106
2107 // Pre-load a static method's oop into r14. Used both by locking code and
2108 // the normal JNI call code.
2109 // point c_arg at the first arg that is already loaded in case we
2110 // need to spill before we call out
2111 c_arg = total_c_args - total_in_args;
2112
2113 if (method->is_static()) {
2114
2115 // load oop into a register
2116 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2117
2118     // Now handlize the static class mirror; it's known to be non-null.
2119 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2120 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2121
2122 // Now get the handle
2123 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2124 // store the klass handle as second argument
2125 __ movptr(c_rarg1, oop_handle_reg);
2126 // and protect the arg if we must spill
2127 c_arg--;
2128 }
2129
2130 // Change state to native (we save the return address in the thread, since it might not
2131 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2132 // points into the right code segment. It does not have to be the correct return pc.
2133 // We use the same pc/oopMap repeatedly when we call out
2134
2135 Label native_return;
2136 if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2137 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2138 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2139 } else {
2140 intptr_t the_pc = (intptr_t) __ pc();
2141 oop_maps->add_gc_map(the_pc - start, map);
2142
2143 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2144 }
2145
2146   // We have all of the arguments set up at this point. We must not touch any
2147   // argument registers from here on (if we had to save/restore them there would be no oop map for them).
2148
2149 if (DTraceMethodProbes) {
2150 // protect the args we've loaded
2151 save_args(masm, total_c_args, c_arg, out_regs);
2152 __ mov_metadata(c_rarg1, method());
2153 __ call_VM_leaf(
2154 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2155 r15_thread, c_rarg1);
2156 restore_args(masm, total_c_args, c_arg, out_regs);
2157 }
2158
2159 // RedefineClasses() tracing support for obsolete method entry
2160 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2161 // protect the args we've loaded
2162 save_args(masm, total_c_args, c_arg, out_regs);
2163 __ mov_metadata(c_rarg1, method());
2164 __ call_VM_leaf(
2165 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2166 r15_thread, c_rarg1);
2167 restore_args(masm, total_c_args, c_arg, out_regs);
2168 }
2169
2170 // Lock a synchronized method
2171
2172 // Register definitions used by locking and unlocking
2173
2174 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2175 const Register obj_reg = rbx; // Will contain the oop
2176 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2177 const Register old_hdr = r13; // value of old header at unlock time
2178
2179 Label slow_path_lock;
2180 Label lock_done;
2181
2182 if (method->is_synchronized()) {
2183 Label count_mon;
2184
2185 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2186
2187 // Get the handle (the 2nd argument)
2188 __ mov(oop_handle_reg, c_rarg1);
2189
2190 // Get address of the box
2191
2192 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2193
2194 // Load the oop from the handle
2195 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2196
2197 if (LockingMode == LM_MONITOR) {
2198 __ jmp(slow_path_lock);
2199 } else if (LockingMode == LM_LEGACY) {
2200 // Load immediate 1 into swap_reg %rax
2201 __ movl(swap_reg, 1);
2202
2203 // Load (object->mark() | 1) into swap_reg %rax
2204 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2205
2206 // Save (object->mark() | 1) into BasicLock's displaced header
2207 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2208
2209 // src -> dest iff dest == rax else rax <- dest
2210 __ lock();
2211 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2212 __ jcc(Assembler::equal, count_mon);
2213
2214 // Hmm should this move to the slow path code area???
2215
2216 // Test if the oopMark is an obvious stack pointer, i.e.,
2217 // 1) (mark & 3) == 0, and
2218 // 2) rsp <= mark < mark + os::pagesize()
2219 // These 3 tests can be done by evaluating the following
2220 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2221 // assuming both stack pointer and pagesize have their
2222 // least significant 2 bits clear.
2223 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2224
2225 __ subptr(swap_reg, rsp);
2226 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2227
2228 // Save the test result, for recursive case, the result is zero
2229 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2230 __ jcc(Assembler::notEqual, slow_path_lock);
2231
2232 __ bind(count_mon);
2233 __ inc_held_monitor_count();
2234 } else {
2235 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2236 __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2237 }
2238
2239 // Slow path will re-enter here
2240 __ bind(lock_done);
2241 }
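  // Illustrative only: the LM_LEGACY fast path above is roughly equivalent to
  // this sketch (not generated code):
  //
  //   mark = obj->mark();
  //   lock->displaced_header = mark | 1;
  //   if (CAS(&obj->mark(), mark | 1, lock)) goto locked;     // uncontended case
  //   // CAS failed: check for recursion, i.e. the mark already points into our stack
  //   if (((obj->mark() - rsp) & (3 - page_size)) == 0) {
  //     lock->displaced_header = 0;                           // recursive case
  //     goto locked;
  //   }
  //   goto slow_path_lock;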
2242
2243 // Finally just about ready to make the JNI call
2244
2245 // get JNIEnv* which is first argument to native
2246 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2247
2248 // Now set thread in native
2249 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2250
2251 __ call(RuntimeAddress(native_func));
2252
2253 // Verify or restore cpu control state after JNI call
2254 __ restore_cpu_control_state_after_jni(rscratch1);
2255
2256 // Unpack native results.
2257 switch (ret_type) {
2258 case T_BOOLEAN: __ c2bool(rax); break;
2259 case T_CHAR : __ movzwl(rax, rax); break;
2260 case T_BYTE : __ sign_extend_byte (rax); break;
2261 case T_SHORT : __ sign_extend_short(rax); break;
2262 case T_INT : /* nothing to do */ break;
2263 case T_DOUBLE :
2264 case T_FLOAT :
2265 // Result is in xmm0 we'll save as needed
2266 break;
2267 case T_ARRAY: // Really a handle
2268 case T_OBJECT: // Really a handle
2269 break; // can't de-handlize until after safepoint check
2270 case T_VOID: break;
2271 case T_LONG: break;
2272 default : ShouldNotReachHere();
2273 }
2274
2275 // Switch thread to "native transition" state before reading the synchronization state.
2276 // This additional state is necessary because reading and testing the synchronization
2277 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2278 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2279 // VM thread changes sync state to synchronizing and suspends threads for GC.
2280 // Thread A is resumed to finish this native method, but doesn't block here since it
2281   // didn't see any synchronization in progress, and escapes.
2282 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2283
2284 // Force this write out before the read below
2285 if (!UseSystemMemoryBarrier) {
2286 __ membar(Assembler::Membar_mask_bits(
2287 Assembler::LoadLoad | Assembler::LoadStore |
2288 Assembler::StoreLoad | Assembler::StoreStore));
2289 }
2290
2291 // check for safepoint operation in progress and/or pending suspend requests
2292 {
2293 Label Continue;
2294 Label slow_path;
2295
2296 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2297
2298 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2299 __ jcc(Assembler::equal, Continue);
2300 __ bind(slow_path);
2301
2302     // Don't use call_VM as it will see a possible pending exception and forward it
2303     // and never return here, preventing us from clearing _last_native_pc down below.
2304     // We can't use call_VM_leaf either, as it will check whether rsi & rdi are
2305     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2306     // by hand.
2307 //
2308 __ vzeroupper();
2309 save_native_result(masm, ret_type, stack_slots);
2310 __ mov(c_rarg0, r15_thread);
2311 __ mov(r12, rsp); // remember sp
2312 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2313 __ andptr(rsp, -16); // align stack as required by ABI
2314 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2315 __ mov(rsp, r12); // restore sp
2316 __ reinit_heapbase();
2317 // Restore any method result value
2318 restore_native_result(masm, ret_type, stack_slots);
2319 __ bind(Continue);
2320 }
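  // Illustrative only: the native-to-Java transition above follows this protocol
  // (sketch, not generated code):
  //
  //   thread->state = _thread_in_native_trans;
  //   full_fence();                                    // unless UseSystemMemoryBarrier
  //   if (safepoint_poll_armed() || thread->suspend_flags != 0) {
  //     save_native_result();
  //     JavaThread::check_special_condition_for_native_trans(thread);  // may block
  //     restore_native_result();
  //   }
  //   thread->state = _thread_in_Java;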
2321
2322 // change thread state
2323 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2324
2325 if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2326 // Check preemption for Object.wait()
2327 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2328 __ cmpptr(rscratch1, NULL_WORD);
2329 __ jccb(Assembler::equal, native_return);
2330 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2331 __ jmp(rscratch1);
2332 __ bind(native_return);
2333
2334 intptr_t the_pc = (intptr_t) __ pc();
2335 oop_maps->add_gc_map(the_pc - start, map);
2336 }
2337
2338
2339 Label reguard;
2340 Label reguard_done;
2341 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2342 __ jcc(Assembler::equal, reguard);
2343 __ bind(reguard_done);
2344
2345 // native result if any is live
2346
2347 // Unlock
2348 Label slow_path_unlock;
2349 Label unlock_done;
2350 if (method->is_synchronized()) {
2351
2352 Label fast_done;
2353
2354 // Get locked oop from the handle we passed to jni
2355 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2356
2357 if (LockingMode == LM_LEGACY) {
2358 Label not_recur;
2359 // Simple recursive lock?
2360 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2361 __ jcc(Assembler::notEqual, not_recur);
2362 __ dec_held_monitor_count();
2363 __ jmpb(fast_done);
2364 __ bind(not_recur);
2365 }
2366
2367 // Must save rax if it is live now because cmpxchg must use it
2368 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2369 save_native_result(masm, ret_type, stack_slots);
2370 }
2371
2372 if (LockingMode == LM_MONITOR) {
2373 __ jmp(slow_path_unlock);
2374 } else if (LockingMode == LM_LEGACY) {
2375 // get address of the stack lock
2376 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2377 // get old displaced header
2378 __ movptr(old_hdr, Address(rax, 0));
2379
2380 // Atomic swap old header if oop still contains the stack lock
2381 __ lock();
2382 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2383 __ jcc(Assembler::notEqual, slow_path_unlock);
2384 __ dec_held_monitor_count();
2385 } else {
2386 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2387 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2388 }
2389
2390 // slow path re-enters here
2391 __ bind(unlock_done);
2392 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2393 restore_native_result(masm, ret_type, stack_slots);
2394 }
2395
2396 __ bind(fast_done);
2397 }
2398 if (DTraceMethodProbes) {
2399 save_native_result(masm, ret_type, stack_slots);
2400 __ mov_metadata(c_rarg1, method());
2401 __ call_VM_leaf(
2402 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2403 r15_thread, c_rarg1);
2404 restore_native_result(masm, ret_type, stack_slots);
2405 }
2406
2407 __ reset_last_Java_frame(false);
2408
2409 // Unbox oop result, e.g. JNIHandles::resolve value.
2410 if (is_reference_type(ret_type)) {
2411 __ resolve_jobject(rax /* value */,
2412 rcx /* tmp */);
2413 }
2414
2415 if (CheckJNICalls) {
2416 // clear_pending_jni_exception_check
2417 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2418 }
2419
2420 // reset handle block
2421 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2422 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2423
2424 // pop our frame
2425
2426 __ leave();
2427
2428 #if INCLUDE_JFR
2429 // We need to do a poll test after unwind in case the sampler
2430 // managed to sample the native frame after returning to Java.
2431 Label L_return;
2432 address poll_test_pc = __ pc();
2433 __ relocate(relocInfo::poll_return_type);
2434 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2435 __ jccb(Assembler::zero, L_return);
2436 __ lea(rscratch1, InternalAddress(poll_test_pc));
2437 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2438 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2439 "polling page return stub not created yet");
2440 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2441 __ jump(RuntimeAddress(stub));
2442 __ bind(L_return);
2443 #endif // INCLUDE_JFR
2444
2445 // Any exception pending?
2446 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2447 __ jcc(Assembler::notEqual, exception_pending);
2448
2449 // Return
2450
2451 __ ret(0);
2452
2453 // Unexpected paths are out of line and go here
2454
2455 // forward the exception
2456 __ bind(exception_pending);
2457
2458 // and forward the exception
2459 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2460
2461 // Slow path locking & unlocking
2462 if (method->is_synchronized()) {
2463
2464 // BEGIN Slow path lock
2465 __ bind(slow_path_lock);
2466
2467 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2468 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2469
2470 // protect the args we've loaded
2471 save_args(masm, total_c_args, c_arg, out_regs);
2472
2473 __ mov(c_rarg0, obj_reg);
2474 __ mov(c_rarg1, lock_reg);
2475 __ mov(c_rarg2, r15_thread);
2476
2477 // Not a leaf but we have last_Java_frame setup as we want.
2478 // We don't want to unmount in case of contention since that would complicate preserving
2479 // the arguments that had already been marshalled into the native convention. So we force
2480 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2481 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2482 __ push_cont_fastpath();
2483 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2484 __ pop_cont_fastpath();
2485 restore_args(masm, total_c_args, c_arg, out_regs);
2486
2487 #ifdef ASSERT
2488 { Label L;
2489 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2490 __ jcc(Assembler::equal, L);
2491 __ stop("no pending exception allowed on exit from monitorenter");
2492 __ bind(L);
2493 }
2494 #endif
2495 __ jmp(lock_done);
2496
2497 // END Slow path lock
2498
2499 // BEGIN Slow path unlock
2500 __ bind(slow_path_unlock);
2501
2502 // If we haven't already saved the native result we must save it now as xmm registers
2503 // are still exposed.
2504 __ vzeroupper();
2505 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2506 save_native_result(masm, ret_type, stack_slots);
2507 }
2508
2509 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2510
2511 __ mov(c_rarg0, obj_reg);
2512 __ mov(c_rarg2, r15_thread);
2513 __ mov(r12, rsp); // remember sp
2514 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2515 __ andptr(rsp, -16); // align stack as required by ABI
2516
2517 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2518 // NOTE that obj_reg == rbx currently
2519 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2520 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2521
2522 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2523 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2524 __ mov(rsp, r12); // restore sp
2525 __ reinit_heapbase();
2526 #ifdef ASSERT
2527 {
2528 Label L;
2529 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2530 __ jcc(Assembler::equal, L);
2531 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2532 __ bind(L);
2533 }
2534 #endif /* ASSERT */
2535
2536 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2537
2538 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2539 restore_native_result(masm, ret_type, stack_slots);
2540 }
2541 __ jmp(unlock_done);
2542
2543 // END Slow path unlock
2544
2545 } // synchronized
2546
2547 // SLOW PATH Reguard the stack if needed
2548
2549 __ bind(reguard);
2550 __ vzeroupper();
2551 save_native_result(masm, ret_type, stack_slots);
2552 __ mov(r12, rsp); // remember sp
2553 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2554 __ andptr(rsp, -16); // align stack as required by ABI
2555 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2556 __ mov(rsp, r12); // restore sp
2557 __ reinit_heapbase();
2558 restore_native_result(masm, ret_type, stack_slots);
2559 // and continue
2560 __ jmp(reguard_done);
2561
2562
2563
2564 __ flush();
2565
2566 nmethod *nm = nmethod::new_native_nmethod(method,
2567 compile_id,
2568 masm->code(),
2569 vep_offset,
2570 frame_complete,
2571 stack_slots / VMRegImpl::slots_per_word,
2572 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2573 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2574 oop_maps);
2575
2576 return nm;
2577 }
2578
2579 // This function returns the adjustment size (in number of words) to a c2i adapter
2580 // activation for use during deoptimization
2581 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2582 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2583 }
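// Illustrative only: for example, with 2 callee parameters and 5 callee locals,
// the adjustment is (5 - 2) * Interpreter::stackElementWords = 3 words on amd64
// (where stackElementWords is 1).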
2584
2585
2586 uint SharedRuntime::out_preserve_stack_slots() {
2587 return 0;
2588 }
2589
2590
2591 // Number of stack slots between incoming argument block and the start of
2592 // a new frame. The PROLOG must add this many slots to the stack. The
2593 // EPILOG must remove this many slots. amd64 needs two slots for
2594 // return address.
2595 uint SharedRuntime::in_preserve_stack_slots() {
2596 return 4 + 2 * VerifyStackAtCalls;
2597 }
2598
2599 VMReg SharedRuntime::thread_register() {
2600 return r15_thread->as_VMReg();
2601 }
2602
2603 //------------------------------generate_deopt_blob----------------------------
2604 void SharedRuntime::generate_deopt_blob() {
2605 // Allocate space for the code
2606 ResourceMark rm;
2607 // Setup code generation tools
2608 int pad = 0;
2609 if (UseAVX > 2) {
2610 pad += 1024;
2611 }
2612 if (UseAPX) {
2613 pad += 1024;
2614 }
2615 #if INCLUDE_JVMCI
2616 if (EnableJVMCI) {
2617 pad += 512; // Increase the buffer size when compiling for JVMCI
2618 }
2619 #endif
2620 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2621 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
2622 if (blob != nullptr) {
2623 _deopt_blob = blob->as_deoptimization_blob();
2624 return;
2625 }
2626
2627 CodeBuffer buffer(name, 2560+pad, 1024);
2628 MacroAssembler* masm = new MacroAssembler(&buffer);
2629 int frame_size_in_words;
2630 OopMap* map = nullptr;
2631 OopMapSet *oop_maps = new OopMapSet();
2632
2633 // -------------
2634 // This code enters when returning to a de-optimized nmethod. A return
2635 // address has been pushed on the stack, and return values are in
2636 // registers.
2637 // If we are doing a normal deopt then we were called from the patched
2638 // nmethod from the point we returned to the nmethod. So the return
2639 // address on the stack is wrong by NativeCall::instruction_size
2640 // We will adjust the value so it looks like we have the original return
2641 // address on the stack (like when we eagerly deoptimized).
2642 // In the case of an exception pending when deoptimizing, we enter
2643 // with a return address on the stack that points after the call we patched
2644 // into the exception handler. We have the following register state from,
2645 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2646 // rax: exception oop
2647 // rbx: exception handler
2648 // rdx: throwing pc
2649 // So in this case we simply jam rdx into the useless return address and
2650 // the stack looks just like we want.
2651 //
2652 // At this point we need to de-opt. We save the argument return
2653 // registers. We call the first C routine, fetch_unroll_info(). This
2654 // routine captures the return values and returns a structure which
2655 // describes the current frame size and the sizes of all replacement frames.
2656 // The current frame is compiled code and may contain many inlined
2657 // functions, each with their own JVM state. We pop the current frame, then
2658 // push all the new frames. Then we call the C routine unpack_frames() to
2659 // populate these frames. Finally unpack_frames() returns us the new target
2660 // address. Notice that callee-save registers are BLOWN here; they have
2661 // already been captured in the vframeArray at the time the return PC was
2662 // patched.
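  // Illustrative only: the deopt blob below implements roughly this sequence
  // (sketch, not generated code):
  //
  //   save all live registers;
  //   info = Deoptimization::fetch_unroll_info(thread, exec_mode);
  //   restore the result registers (or install the pending exception results);
  //   pop the deoptimized compiled frame;
  //   for each skeletal frame described by info:
  //     push a return pc, rbp, and an interpreter frame of the recorded size;
  //   Deoptimization::unpack_frames(thread, exec_mode);   // fills in the frames
  //   return into the topmost interpreter frame;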
2663 address start = __ pc();
2664 Label cont;
2665
2666   // Prolog for the non-exception case!
2667
2668 // Save everything in sight.
2669 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2670
2671 // Normal deoptimization. Save exec mode for unpack_frames.
2672 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2673 __ jmp(cont);
2674
2675 int reexecute_offset = __ pc() - start;
2676 #if INCLUDE_JVMCI && !defined(COMPILER1)
2677 if (UseJVMCICompiler) {
2678 // JVMCI does not use this kind of deoptimization
2679 __ should_not_reach_here();
2680 }
2681 #endif
2682
2683 // Reexecute case
2684   // the return address is the pc that describes which bci to re-execute at
2685
2686   // No need to update map as each call to save_live_registers will produce an identical oopmap
2687 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2688
2689 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2690 __ jmp(cont);
2691
2692 #if INCLUDE_JVMCI
2693 Label after_fetch_unroll_info_call;
2694 int implicit_exception_uncommon_trap_offset = 0;
2695 int uncommon_trap_offset = 0;
2696
2697 if (EnableJVMCI) {
2698 implicit_exception_uncommon_trap_offset = __ pc() - start;
2699
2700 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2701 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2702
2703 uncommon_trap_offset = __ pc() - start;
2704
2705 // Save everything in sight.
2706 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2707 // fetch_unroll_info needs to call last_java_frame()
2708 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2709
2710 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2711 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2712
2713 __ movl(r14, Deoptimization::Unpack_reexecute);
2714 __ mov(c_rarg0, r15_thread);
2715 __ movl(c_rarg2, r14); // exec mode
2716 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2717 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2718
2719 __ reset_last_Java_frame(false);
2720
2721 __ jmp(after_fetch_unroll_info_call);
2722 } // EnableJVMCI
2723 #endif // INCLUDE_JVMCI
2724
2725 int exception_offset = __ pc() - start;
2726
2727 // Prolog for exception case
2728
2729   // All registers are dead at this entry point, except for rax and
2730   // rdx, which contain the exception oop and exception pc
2731   // respectively. Set them in TLS and fall through to the
2732   // unpack_with_exception_in_tls entry point.
2733
2734 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2735 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2736
2737 int exception_in_tls_offset = __ pc() - start;
2738
2739 // new implementation because exception oop is now passed in JavaThread
2740
2741 // Prolog for exception case
2742 // All registers must be preserved because they might be used by LinearScan
2743   // Exception oop and throwing PC are passed in JavaThread
2744 // tos: stack at point of call to method that threw the exception (i.e. only
2745 // args are on the stack, no return address)
2746
2747 // make room on stack for the return address
2748 // It will be patched later with the throwing pc. The correct value is not
2749 // available now because loading it from memory would destroy registers.
2750 __ push(0);
2751
2752 // Save everything in sight.
2753 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2754
2755 // Now it is safe to overwrite any register
2756
2757 // Deopt during an exception. Save exec mode for unpack_frames.
2758 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2759
2760 // load throwing pc from JavaThread and patch it as the return address
2761 // of the current frame. Then clear the field in JavaThread
2762
2763 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2764 __ movptr(Address(rbp, wordSize), rdx);
2765 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2766
2767 #ifdef ASSERT
2768 // verify that there is really an exception oop in JavaThread
2769 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2770 __ verify_oop(rax);
2771
2772 // verify that there is no pending exception
2773 Label no_pending_exception;
2774 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2775 __ testptr(rax, rax);
2776 __ jcc(Assembler::zero, no_pending_exception);
2777 __ stop("must not have pending exception here");
2778 __ bind(no_pending_exception);
2779 #endif
2780
2781 __ bind(cont);
2782
2783 // Call C code. Need thread and this frame, but NOT official VM entry
2784 // crud. We cannot block on this call, no GC can happen.
2785 //
2786 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2787
2788 // fetch_unroll_info needs to call last_java_frame().
2789
2790 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2791 #ifdef ASSERT
2792 { Label L;
2793 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2794 __ jcc(Assembler::equal, L);
2795 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2796 __ bind(L);
2797 }
2798 #endif // ASSERT
2799 __ mov(c_rarg0, r15_thread);
2800 __ movl(c_rarg1, r14); // exec_mode
2801 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2802
2803 // Need to have an oopmap that tells fetch_unroll_info where to
2804 // find any register it might need.
2805 oop_maps->add_gc_map(__ pc() - start, map);
2806
2807 __ reset_last_Java_frame(false);
2808
2809 #if INCLUDE_JVMCI
2810 if (EnableJVMCI) {
2811 __ bind(after_fetch_unroll_info_call);
2812 }
2813 #endif
2814
2815 // Load UnrollBlock* into rdi
2816 __ mov(rdi, rax);
2817
2818 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2819 Label noException;
2820 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2821 __ jcc(Assembler::notEqual, noException);
2822 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2823 // QQQ this is useless it was null above
2824 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2825 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2826 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2827
2828 __ verify_oop(rax);
2829
2830 // Overwrite the result registers with the exception results.
2831 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2832 // I think this is useless
2833 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2834
2835 __ bind(noException);
2836
2837 // Only register save data is on the stack.
2838 // Now restore the result registers. Everything else is either dead
2839 // or captured in the vframeArray.
2840 RegisterSaver::restore_result_registers(masm);
2841
2842   // All of the register save area has been popped off the stack. Only the
2843   // return address remains.
2844
2845 // Pop all the frames we must move/replace.
2846 //
2847 // Frame picture (youngest to oldest)
2848 // 1: self-frame (no frame link)
2849 // 2: deopting frame (no frame link)
2850 // 3: caller of deopting frame (could be compiled/interpreted).
2851 //
2852   // Note: by leaving the return address of self-frame on the stack
2853   // and using the size of frame 2 to adjust the stack,
2854   // when we are done the return address to frame 3 will still be on the stack.
2855
2856 // Pop deoptimized frame
2857 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2858 __ addptr(rsp, rcx);
2859
2860 // rsp should be pointing at the return address to the caller (3)
2861
2862 // Pick up the initial fp we should save
2863 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2864 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2865
2866 #ifdef ASSERT
2867   // Compilers generate code that bangs the stack by as much as the
2868   // interpreter would need. So this stack banging should never
2869   // trigger a fault. Verify that it does not on non-product builds.
2870 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2871 __ bang_stack_size(rbx, rcx);
2872 #endif
2873
2874 // Load address of array of frame pcs into rcx
2875 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2876
2877 // Trash the old pc
2878 __ addptr(rsp, wordSize);
2879
2880 // Load address of array of frame sizes into rsi
2881 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2882
2883 // Load counter into rdx
2884 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2885
2886   // Now adjust the caller's stack to make up for the extra locals,
2887   // but record the original sp so that we can save it in the skeletal interpreter
2888   // frame, and so the stack walking of interpreter_sender will get the unextended sp
2889   // value and not the "real" sp value.
2890
2891 const Register sender_sp = r8;
2892
2893 __ mov(sender_sp, rsp);
2894 __ movl(rbx, Address(rdi,
2895 Deoptimization::UnrollBlock::
2896 caller_adjustment_offset()));
2897 __ subptr(rsp, rbx);
2898
2899 // Push interpreter frames in a loop
2900 Label loop;
2901 __ bind(loop);
2902 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2903 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2904 __ pushptr(Address(rcx, 0)); // Save return address
2905 __ enter(); // Save old & set new ebp
2906 __ subptr(rsp, rbx); // Prolog
2907 // This value is corrected by layout_activation_impl
2908 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2909 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2910 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2911 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2912 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2913 __ decrementl(rdx); // Decrement counter
2914 __ jcc(Assembler::notZero, loop);
2915 __ pushptr(Address(rcx, 0)); // Save final return address
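  // Illustrative only: the frame-pushing loop above corresponds roughly to
  // this sketch (not generated code):
  //
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(frame_pcs[k]);                  // return address for this frame
  //     push(rbp); rbp = rsp;                // enter()
  //     rsp -= frame_sizes[k] - 2*wordSize;  // body of the skeletal frame
  //     fp[interpreter_frame_last_sp_offset]   = nullptr;    // fixed up later
  //     fp[interpreter_frame_sender_sp_offset] = sender_sp;  // make it walkable
  //     sender_sp = rsp;
  //   }
  //   push(frame_pcs[number_of_frames]);     // final return address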
2916
2917 // Re-push self-frame
2918 __ enter(); // Save old & set new ebp
2919
2920   // Allocate a full-sized register save area.
2921   // Return address and rbp are in place, so we allocate two fewer words.
2922 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2923
2924 // Restore frame locals after moving the frame
2925 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2926 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2927
2928 // Call C code. Need thread but NOT official VM entry
2929 // crud. We cannot block on this call, no GC can happen. Call should
2930 // restore return values to their stack-slots with the new SP.
2931 //
2932 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2933
2934 // Use rbp because the frames look interpreted now
2935 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2936 // Don't need the precise return PC here, just precise enough to point into this code blob.
2937 address the_pc = __ pc();
2938 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2939
2940 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2941 __ mov(c_rarg0, r15_thread);
2942 __ movl(c_rarg1, r14); // second arg: exec_mode
2943 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2944 // Revert SP alignment after call since we're going to do some SP relative addressing below
2945 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2946
2947 // Set an oopmap for the call site
2948 // Use the same PC we used for the last java frame
2949 oop_maps->add_gc_map(the_pc - start,
2950 new OopMap( frame_size_in_words, 0 ));
2951
2952 // Clear fp AND pc
2953 __ reset_last_Java_frame(true);
2954
2955 // Collect return values
2956 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2957 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2958 // I think this is useless (throwing pc?)
2959 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2960
2961 // Pop self-frame.
2962 __ leave(); // Epilog
2963
2964 // Jump to interpreter
2965 __ ret(0);
2966
2967 // Make sure all code is generated
2968 masm->flush();
2969
2970 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2971 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2972 #if INCLUDE_JVMCI
2973 if (EnableJVMCI) {
2974 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2975 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2976 }
2977 #endif
2978
2979 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
2980 }
2981
2982 //------------------------------generate_handler_blob------
2983 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
2986 //
2987 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
2988 assert(StubRoutines::forward_exception_entry() != nullptr,
2989 "must be generated before");
2990 assert(is_polling_page_id(id), "expected a polling page stub id");
2991
2992 // Allocate space for the code. Setup code generation tools.
2993 const char* name = SharedRuntime::stub_name(id);
2994 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
2995 if (blob != nullptr) {
2996 return blob->as_safepoint_blob();
2997 }
2998
2999 ResourceMark rm;
3000 OopMapSet *oop_maps = new OopMapSet();
3001 OopMap* map;
3002 CodeBuffer buffer(name, 2548, 1024);
3003 MacroAssembler* masm = new MacroAssembler(&buffer);
3004
3005 address start = __ pc();
3006 address call_pc = nullptr;
3007 int frame_size_in_words;
3008 bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3009 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3010
3011 // Make room for return address (or push it again)
3012 if (!cause_return) {
3013 __ push(rbx);
3014 }
3015
3016 // Save registers, fpu state, and flags
3017 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3018
3019 // The following is basically a call_VM. However, we need the precise
3020 // address of the call in order to generate an oopmap. Hence, we do all the
3021 // work ourselves.
3022
3023 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3024
  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.
3027
3028 if (!cause_return) {
3029 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3030 // Additionally, rbx is a callee saved register and we can look at it later to determine
3031 // if someone changed the return address for us!
3032 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3033 __ movptr(Address(rbp, wordSize), rbx);
3034 }
3035
3036 // Do the call
3037 __ mov(c_rarg0, r15_thread);
3038 __ call(RuntimeAddress(call_ptr));
3039
3040 // Set an oopmap for the call site. This oopmap will map all
3041 // oop-registers and debug-info registers as callee-saved. This
3042 // will allow deoptimization at this safepoint to find all possible
3043 // debug-info recordings, as well as let GC find all oops.
3044
3045 oop_maps->add_gc_map( __ pc() - start, map);
3046
3047 Label noException;
3048
3049 __ reset_last_Java_frame(false);
3050
3051 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3052 __ jcc(Assembler::equal, noException);
3053
3054 // Exception pending
3055
3056 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3057
3058 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3059
3060 // No exception case
3061 __ bind(noException);
3062
3063 Label no_adjust;
3064 #ifdef ASSERT
3065 Label bail;
3066 #endif
3067 if (!cause_return) {
3068 Label no_prefix, not_special, check_rex_prefix;
3069
3070 // If our stashed return pc was modified by the runtime we avoid touching it
3071 __ cmpptr(rbx, Address(rbp, wordSize));
3072 __ jcc(Assembler::notEqual, no_adjust);
3073
3074 // Skip over the poll instruction.
3075 // See NativeInstruction::is_safepoint_poll()
3076 // Possible encodings:
3077 // 85 00 test %eax,(%rax)
3078 // 85 01 test %eax,(%rcx)
3079 // 85 02 test %eax,(%rdx)
3080 // 85 03 test %eax,(%rbx)
3081 // 85 06 test %eax,(%rsi)
3082 // 85 07 test %eax,(%rdi)
3083 //
3084 // 41 85 00 test %eax,(%r8)
3085 // 41 85 01 test %eax,(%r9)
3086 // 41 85 02 test %eax,(%r10)
3087 // 41 85 03 test %eax,(%r11)
3088 // 41 85 06 test %eax,(%r14)
3089 // 41 85 07 test %eax,(%r15)
3090 //
3091 // 85 04 24 test %eax,(%rsp)
3092 // 41 85 04 24 test %eax,(%r12)
3093 // 85 45 00 test %eax,0x0(%rbp)
3094 // 41 85 45 00 test %eax,0x0(%r13)
3095 //
    // Notes:
    //   Format of the legacy MAP0 test instruction:
    //   [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //   o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
    //      first register operand and of the base register of the memory operand are both in
    //      the range [0-8), so no additional REX prefix (whose REX.B bit holds the MSB of the
    //      register encoding) is required and the two-byte encoding is sufficient.
    //   o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
    //      register of the memory operand is 1000, so an additional REX prefix is required,
    //      adding one byte to the instruction encoding.
    //   o  If the BASE register is one of the 32 extended GPRs available only on targets
    //      supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
    //      the two most significant bits of the 5-bit register encoding.
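    //
    // As a worked illustration of the skip logic below (byte values taken from the table
    // above): for "41 85 45 00  test %eax,0x0(%r13)" the REX.B prefix advances rbx by 1,
    // the ModRM base field (0x45 & 0x07 == 0x05, the r13/rbp case) advances it by 1 more,
    // and the final addptr(rbx, 2) steps over the opcode and ModRM bytes, 4 bytes in total.
    // For "85 04 24  test %eax,(%rsp)" only the SIB adjustment and the final +2 apply, 3 bytes.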
3109
3110 if (VM_Version::supports_apx_f()) {
3111 __ cmpb(Address(rbx, 0), Assembler::REX2);
3112 __ jccb(Assembler::notEqual, check_rex_prefix);
3113 __ addptr(rbx, 2);
3114 __ bind(check_rex_prefix);
3115 }
3116 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3117 __ jccb(Assembler::notEqual, no_prefix);
3118 __ addptr(rbx, 1);
3119 __ bind(no_prefix);
3120 #ifdef ASSERT
3121 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3122 #endif
3123 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3124 // r12/rsp 0x04
3125 // r13/rbp 0x05
3126 __ movzbq(rcx, Address(rbx, 1));
3127 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3128 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3129 __ cmpptr(rcx, 1);
3130 __ jccb(Assembler::above, not_special);
3131 __ addptr(rbx, 1);
3132 __ bind(not_special);
3133 #ifdef ASSERT
3134 // Verify the correct encoding of the poll we're about to skip.
3135 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3136 __ jcc(Assembler::notEqual, bail);
3137 // Mask out the modrm bits
3138 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3139 // rax encodes to 0, so if the bits are nonzero it's incorrect
3140 __ jcc(Assembler::notZero, bail);
3141 #endif
3142 // Adjust return pc forward to step over the safepoint poll instruction
3143 __ addptr(rbx, 2);
3144 __ movptr(Address(rbp, wordSize), rbx);
3145 }
3146
3147 __ bind(no_adjust);
3148 // Normal exit, restore registers and exit.
3149 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3150 __ ret(0);
3151
3152 #ifdef ASSERT
3153 __ bind(bail);
3154 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3155 #endif
3156
3157 // Make sure all code is generated
3158 masm->flush();
3159
3160 // Fill-out other meta info
3161 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3162
3163 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3164 return sp_blob;
3165 }
3166
3167 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3169 //
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
3174 //
3175 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3176 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3177 assert(is_resolve_id(id), "expected a resolve stub id");
3178
3179 const char* name = SharedRuntime::stub_name(id);
3180 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3181 if (blob != nullptr) {
3182 return blob->as_runtime_stub();
3183 }
3184
3185 // allocate space for the code
3186 ResourceMark rm;
3187 CodeBuffer buffer(name, 1552, 512);
3188 MacroAssembler* masm = new MacroAssembler(&buffer);
3189
3190 int frame_size_in_words;
3191
3192 OopMapSet *oop_maps = new OopMapSet();
3193 OopMap* map = nullptr;
3194
3195 int start = __ offset();
3196
3197 // No need to save vector registers since they are caller-saved anyway.
3198 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3199
3200 int frame_complete = __ offset();
3201
3202 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3203
3204 __ mov(c_rarg0, r15_thread);
3205
3206 __ call(RuntimeAddress(destination));
3207
3208
3209 // Set an oopmap for the call site.
3210 // We need this not only for callee-saved registers, but also for volatile
3211 // registers that the compiler might be keeping live across a safepoint.
3212
3213 oop_maps->add_gc_map( __ offset() - start, map);
3214
3215 // rax contains the address we are going to jump to assuming no exception got installed
3216
3217 // clear last_Java_sp
3218 __ reset_last_Java_frame(false);
3219 // check for pending exceptions
3220 Label pending;
3221 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3222 __ jcc(Assembler::notEqual, pending);
3223
3224 // get the returned Method*
3225 __ get_vm_result_metadata(rbx);
3226 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3227
3228 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3229
3230 RegisterSaver::restore_live_registers(masm);
3231
3232 // We are back to the original state on entry and ready to go.
3233
3234 __ jmp(rax);
3235
3236 // Pending exception after the safepoint
3237
3238 __ bind(pending);
3239
3240 RegisterSaver::restore_live_registers(masm);
3241
3242 // exception pending => remove activation and forward to exception handler
3243
3244 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3245
3246 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3247 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3248
3249 // -------------
3250 // make sure all code is generated
3251 masm->flush();
3252
3253 // return the blob
  // new_runtime_stub takes the frame size in words
3255 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3256
3257 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3258 return rs_blob;
3259 }
3260
3261 // Continuation point for throwing of implicit exceptions that are
3262 // not handled in the current activation. Fabricates an exception
3263 // oop and initiates normal exception dispatching in this
3264 // frame. Since we need to preserve callee-saved values (currently
3265 // only for C2, but done for C1 as well) we need a callee-saved oop
3266 // map and therefore have to make these stubs into RuntimeStubs
3267 // rather than BufferBlobs. If the compiler needs all registers to
3268 // be preserved between the fault point and the exception handler
3269 // then it must assume responsibility for that in
3270 // AbstractCompiler::continuation_for_implicit_null_exception or
3271 // continuation_for_implicit_division_by_zero_exception. All other
3272 // implicit exceptions (e.g., NullPointerException or
3273 // AbstractMethodError on entry) are either at call sites or
3274 // otherwise assume that stack unwinding will be initiated, so
3275 // caller saved registers were assumed volatile in the compiler.
3276 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3277 assert(is_throw_id(id), "expected a throw stub id");
3278
3279 const char* name = SharedRuntime::stub_name(id);
3280
3281 // Information about frame layout at time of blocking runtime call.
3282 // Note that we only have to preserve callee-saved registers since
3283 // the compilers are responsible for supplying a continuation point
3284 // if they expect all registers to be preserved.
3285 enum layout {
3286 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3287 rbp_off2,
3288 return_off,
3289 return_off2,
3290 framesize // inclusive of return address
3291 };
3292
3293 int insts_size = 512;
3294 int locs_size = 64;
3295
3296 const char* timer_msg = "SharedRuntime generate_throw_exception";
3297 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3298
3299 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3300 if (blob != nullptr) {
3301 return blob->as_runtime_stub();
3302 }
3303
3304 ResourceMark rm;
3305 CodeBuffer code(name, insts_size, locs_size);
3306 OopMapSet* oop_maps = new OopMapSet();
3307 MacroAssembler* masm = new MacroAssembler(&code);
3308
3309 address start = __ pc();
3310
3311 // This is an inlined and slightly modified version of call_VM
3312 // which has the ability to fetch the return PC out of
3313 // thread-local storage and also sets up last_Java_sp slightly
3314 // differently than the real call_VM
3315
3316 __ enter(); // required for proper stackwalking of RuntimeStub frame
3317
3318 assert(is_even(framesize/2), "sp not 16-byte aligned");
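  // framesize counts 32-bit slots, so framesize/2 is the frame size in 8-byte
  // words; an even word count keeps rsp 16-byte aligned after the subptr below.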
3319
3320 // return address and rbp are already in place
3321 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3322
3323 int frame_complete = __ pc() - start;
3324
3325 // Set up last_Java_sp and last_Java_fp
3326 address the_pc = __ pc();
3327 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3328 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3329
3330 // Call runtime
3331 __ movptr(c_rarg0, r15_thread);
3332 BLOCK_COMMENT("call runtime_entry");
3333 __ call(RuntimeAddress(runtime_entry));
3334
3335 // Generate oop map
3336 OopMap* map = new OopMap(framesize, 0);
3337
3338 oop_maps->add_gc_map(the_pc - start, map);
3339
3340 __ reset_last_Java_frame(true);
3341
3342 __ leave(); // required for proper stackwalking of RuntimeStub frame
3343
3344 // check for pending exceptions
3345 #ifdef ASSERT
3346 Label L;
3347 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3348 __ jcc(Assembler::notEqual, L);
3349 __ should_not_reach_here();
3350 __ bind(L);
3351 #endif // ASSERT
3352 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3353
3354
3355 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3356 RuntimeStub* stub =
3357 RuntimeStub::new_runtime_stub(name,
3358 &code,
3359 frame_complete,
3360 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3361 oop_maps, false);
3362 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, (uint)id, name);
3363
3364 return stub;
3365 }
3366
3367 //------------------------------Montgomery multiplication------------------------
3368 //
3369
3370 #ifndef _WINDOWS
3371
3372 // Subtract 0:b from carry:a. Return carry.
3373 static julong
3374 sub(julong a[], julong b[], julong carry, long len) {
3375 long long i = 0, cnt = len;
3376 julong tmp;
3377 asm volatile("clc; "
3378 "0: ; "
3379 "mov (%[b], %[i], 8), %[tmp]; "
3380 "sbb %[tmp], (%[a], %[i], 8); "
3381 "inc %[i]; dec %[cnt]; "
3382 "jne 0b; "
3383 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3384 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3385 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3386 : "memory");
3387 return tmp;
3388 }
3389
3390 // Multiply (unsigned) Long A by Long B, accumulating the double-
3391 // length result into the accumulator formed of T0, T1, and T2.
3392 #define MACC(A, B, T0, T1, T2) \
3393 do { \
3394 unsigned long hi, lo; \
3395 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3396 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3397 : "r"(A), "a"(B) : "cc"); \
3398 } while(0)
3399
3400 // As above, but add twice the double-length result into the
3401 // accumulator.
3402 #define MACC2(A, B, T0, T1, T2) \
3403 do { \
3404 unsigned long hi, lo; \
3405 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3406 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3407 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3408 : "r"(A), "a"(B) : "cc"); \
3409 } while(0)
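// In both macros the accumulator T2:T1:T0 behaves as a single 192-bit value:
// "mul" leaves the 128-bit product A*B in rdx:rax, the add/adc pair folds it
// into T0 and T1, and the trailing "adc $0" propagates any carry into T2
// (MACC2 repeats the add/adc/adc sequence to add the product twice).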
3410
3411 #else //_WINDOWS
3412
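// Subtract 0:b from carry:a. Return carry.
// This mirrors the inline-asm version above using the identity a - b = a + ~b + 1:
// the carry chain starts at 1 to supply the "+1", and the final _addcarry_u64
// against ~0 yields carry minus the borrow out of the subtraction.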
3413 static julong
3414 sub(julong a[], julong b[], julong carry, long len) {
3415 long i;
3416 julong tmp;
3417 unsigned char c = 1;
3418 for (i = 0; i < len; i++) {
3419 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3420 a[i] = tmp;
3421 }
3422 c = _addcarry_u64(c, carry, ~0, &tmp);
3423 return tmp;
3424 }
3425
3426 // Multiply (unsigned) Long A by Long B, accumulating the double-
3427 // length result into the accumulator formed of T0, T1, and T2.
3428 #define MACC(A, B, T0, T1, T2) \
3429 do { \
3430 julong hi, lo; \
3431 lo = _umul128(A, B, &hi); \
3432 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3433 c = _addcarry_u64(c, hi, T1, &T1); \
3434 _addcarry_u64(c, T2, 0, &T2); \
3435 } while(0)
3436
3437 // As above, but add twice the double-length result into the
3438 // accumulator.
3439 #define MACC2(A, B, T0, T1, T2) \
3440 do { \
3441 julong hi, lo; \
3442 lo = _umul128(A, B, &hi); \
3443 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3444 c = _addcarry_u64(c, hi, T1, &T1); \
3445 _addcarry_u64(c, T2, 0, &T2); \
3446 c = _addcarry_u64(0, lo, T0, &T0); \
3447 c = _addcarry_u64(c, hi, T1, &T1); \
3448 _addcarry_u64(c, T2, 0, &T2); \
3449 } while(0)
3450
3451 #endif //_WINDOWS
3452
3453 // Fast Montgomery multiplication. The derivation of the algorithm is
3454 // in A Cryptographic Library for the Motorola DSP56000,
3455 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
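// In the standard Montgomery formulation, with R = 2^(64*len) and
// inv == -n^-1 (mod 2^64) -- which is what the assert on inv * n[0] below
// verifies -- the routine produces m with m == a * b * R^-1 (mod n); the
// trailing sub() loop folds any overflow word back by subtracting n.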
3456
3457 static void NOINLINE
3458 montgomery_multiply(julong a[], julong b[], julong n[],
3459 julong m[], julong inv, int len) {
3460 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3461 int i;
3462
3463 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3464
3465 for (i = 0; i < len; i++) {
3466 int j;
3467 for (j = 0; j < i; j++) {
3468 MACC(a[j], b[i-j], t0, t1, t2);
3469 MACC(m[j], n[i-j], t0, t1, t2);
3470 }
3471 MACC(a[i], b[0], t0, t1, t2);
3472 m[i] = t0 * inv;
3473 MACC(m[i], n[0], t0, t1, t2);
3474
3475 assert(t0 == 0, "broken Montgomery multiply");
3476
3477 t0 = t1; t1 = t2; t2 = 0;
3478 }
3479
3480 for (i = len; i < 2*len; i++) {
3481 int j;
3482 for (j = i-len+1; j < len; j++) {
3483 MACC(a[j], b[i-j], t0, t1, t2);
3484 MACC(m[j], n[i-j], t0, t1, t2);
3485 }
3486 m[i-len] = t0;
3487 t0 = t1; t1 = t2; t2 = 0;
3488 }
3489
3490 while (t0)
3491 t0 = sub(m, n, t0, len);
3492 }
3493
3494 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3495 // multiplies so it should be up to 25% faster than Montgomery
3496 // multiplication. However, its loop control is more complex and it
3497 // may actually run slower on some machines.
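// The saving comes from symmetry: a[j]*a[i-j] equals a[i-j]*a[j], so each
// off-diagonal product is accumulated once via MACC2 (which adds it twice)
// and the diagonal term a[i/2]^2 is added once when i is even.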
3498
3499 static void NOINLINE
3500 montgomery_square(julong a[], julong n[],
3501 julong m[], julong inv, int len) {
3502 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3503 int i;
3504
3505 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3506
3507 for (i = 0; i < len; i++) {
3508 int j;
3509 int end = (i+1)/2;
3510 for (j = 0; j < end; j++) {
3511 MACC2(a[j], a[i-j], t0, t1, t2);
3512 MACC(m[j], n[i-j], t0, t1, t2);
3513 }
3514 if ((i & 1) == 0) {
3515 MACC(a[j], a[j], t0, t1, t2);
3516 }
3517 for (; j < i; j++) {
3518 MACC(m[j], n[i-j], t0, t1, t2);
3519 }
3520 m[i] = t0 * inv;
3521 MACC(m[i], n[0], t0, t1, t2);
3522
3523 assert(t0 == 0, "broken Montgomery square");
3524
3525 t0 = t1; t1 = t2; t2 = 0;
3526 }
3527
3528 for (i = len; i < 2*len; i++) {
3529 int start = i-len+1;
3530 int end = start + (len - start)/2;
3531 int j;
3532 for (j = start; j < end; j++) {
3533 MACC2(a[j], a[i-j], t0, t1, t2);
3534 MACC(m[j], n[i-j], t0, t1, t2);
3535 }
3536 if ((i & 1) == 0) {
3537 MACC(a[j], a[j], t0, t1, t2);
3538 }
3539 for (; j < len; j++) {
3540 MACC(m[j], n[i-j], t0, t1, t2);
3541 }
3542 m[i-len] = t0;
3543 t0 = t1; t1 = t2; t2 = 0;
3544 }
3545
3546 while (t0)
3547 t0 = sub(m, n, t0, len);
3548 }
3549
3550 // Swap words in a longword.
3551 static julong swap(julong x) {
3552 return (x << 32) | (x >> 32);
3553 }
3554
3555 // Copy len longwords from s to d, word-swapping as we go. The
3556 // destination array is reversed.
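// For example, with len == 2 (values chosen purely for illustration):
//   s = { 0x1111111122222222, 0x3333333344444444 }
// becomes
//   d = { 0x4444444433333333, 0x2222222211111111 }
// i.e. the callers use this to move between the 32-bit word order of the
// jint arrays and the julong order used internally.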
3557 static void reverse_words(julong *s, julong *d, int len) {
3558 d += len;
3559 while(len-- > 0) {
3560 d--;
3561 *d = swap(*s);
3562 s++;
3563 }
3564 }
3565
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3568 #define MONTGOMERY_SQUARING_THRESHOLD 64
3569
3570 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3571 jint len, jlong inv,
3572 jint *m_ints) {
3573 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3574 int longwords = len/2;
3575
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints correspond to a 16384-bit integer and will use
  // a total of 8K bytes of stack space here (4 arrays of 256 julongs).
3579 int divisor = sizeof(julong) * 4;
3580 guarantee(longwords <= 8192 / divisor, "must be");
3581 int total_allocation = longwords * sizeof (julong) * 4;
3582 julong *scratch = (julong *)alloca(total_allocation);
3583
3584 // Local scratch arrays
3585 julong
3586 *a = scratch + 0 * longwords,
3587 *b = scratch + 1 * longwords,
3588 *n = scratch + 2 * longwords,
3589 *m = scratch + 3 * longwords;
3590
3591 reverse_words((julong *)a_ints, a, longwords);
3592 reverse_words((julong *)b_ints, b, longwords);
3593 reverse_words((julong *)n_ints, n, longwords);
3594
3595 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3596
3597 reverse_words(m, (julong *)m_ints, longwords);
3598 }
3599
3600 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3601 jint len, jlong inv,
3602 jint *m_ints) {
3603 assert(len % 2 == 0, "array length in montgomery_square must be even");
3604 int longwords = len/2;
3605
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints correspond to a 16384-bit integer and will use
  // a total of 6K bytes of stack space here (3 arrays of 256 julongs).
3609 int divisor = sizeof(julong) * 3;
3610 guarantee(longwords <= (8192 / divisor), "must be");
3611 int total_allocation = longwords * sizeof (julong) * 3;
3612 julong *scratch = (julong *)alloca(total_allocation);
3613
3614 // Local scratch arrays
3615 julong
3616 *a = scratch + 0 * longwords,
3617 *n = scratch + 1 * longwords,
3618 *m = scratch + 2 * longwords;
3619
3620 reverse_words((julong *)a_ints, a, longwords);
3621 reverse_words((julong *)n_ints, n, longwords);
3622
3623 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3624 ::montgomery_square(a, n, m, (julong)inv, longwords);
3625 } else {
3626 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3627 }
3628
3629 reverse_words(m, (julong *)m_ints, longwords);
3630 }
3631
3632 #if INCLUDE_JFR
3633
3634 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3635 // It returns a jobject handle to the event writer.
3636 // The handle is dereferenced and the return value is the event writer oop.
3637 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3638 enum layout {
3639 rbp_off,
3640 rbpH_off,
3641 return_off,
3642 return_off2,
3643 framesize // inclusive of return address
3644 };
3645
3646 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3647 CodeBuffer code(name, 1024, 64);
3648 MacroAssembler* masm = new MacroAssembler(&code);
3649 address start = __ pc();
3650
3651 __ enter();
3652 address the_pc = __ pc();
3653
3654 int frame_complete = the_pc - start;
3655
3656 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3657 __ movptr(c_rarg0, r15_thread);
3658 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3659 __ reset_last_Java_frame(true);
3660
3661 // rax is jobject handle result, unpack and process it through a barrier.
3662 __ resolve_global_jobject(rax, c_rarg0);
3663
3664 __ leave();
3665 __ ret(0);
3666
3667 OopMapSet* oop_maps = new OopMapSet();
3668 OopMap* map = new OopMap(framesize, 1);
3669 oop_maps->add_gc_map(frame_complete, map);
3670
3671 RuntimeStub* stub =
3672 RuntimeStub::new_runtime_stub(name,
3673 &code,
3674 frame_complete,
3675 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3676 oop_maps,
3677 false);
3678 return stub;
3679 }
3680
3681 // For c2: call to return a leased buffer.
3682 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3683 enum layout {
3684 rbp_off,
3685 rbpH_off,
3686 return_off,
3687 return_off2,
3688 framesize // inclusive of return address
3689 };
3690
3691 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3692 CodeBuffer code(name, 1024, 64);
3693 MacroAssembler* masm = new MacroAssembler(&code);
3694 address start = __ pc();
3695
3696 __ enter();
3697 address the_pc = __ pc();
3698
3699 int frame_complete = the_pc - start;
3700
3701 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3702 __ movptr(c_rarg0, r15_thread);
3703 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3704 __ reset_last_Java_frame(true);
3705
3706 __ leave();
3707 __ ret(0);
3708
3709 OopMapSet* oop_maps = new OopMapSet();
3710 OopMap* map = new OopMap(framesize, 1);
3711 oop_maps->add_gc_map(frame_complete, map);
3712
3713 RuntimeStub* stub =
3714 RuntimeStub::new_runtime_stub(name,
3715 &code,
3716 frame_complete,
3717 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3718 oop_maps,
3719 false);
3720 return stub;
3721 }
3722
3723 #endif // INCLUDE_JFR
3724
--- EOF ---