1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "classfile/symbolTable.hpp"
31 #include "code/aotCodeCache.hpp"
32 #include "code/compiledIC.hpp"
33 #include "code/debugInfoRec.hpp"
34 #include "code/nativeInst.hpp"
35 #include "code/vtableStubs.hpp"
36 #include "compiler/oopMap.hpp"
37 #include "gc/shared/collectedHeap.hpp"
38 #include "gc/shared/gcLocker.hpp"
39 #include "gc/shared/barrierSet.hpp"
40 #include "gc/shared/barrierSetAssembler.hpp"
41 #include "interpreter/interpreter.hpp"
42 #include "logging/log.hpp"
43 #include "memory/resourceArea.hpp"
44 #include "memory/universe.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "oops/method.inline.hpp"
47 #include "prims/methodHandles.hpp"
48 #include "runtime/continuation.hpp"
49 #include "runtime/continuationEntry.inline.hpp"
50 #include "runtime/globals.hpp"
51 #include "runtime/jniHandles.hpp"
52 #include "runtime/safepointMechanism.hpp"
53 #include "runtime/sharedRuntime.hpp"
54 #include "runtime/signature.hpp"
55 #include "runtime/stubRoutines.hpp"
56 #include "runtime/timerTrace.hpp"
57 #include "runtime/vframeArray.hpp"
58 #include "runtime/vm_version.hpp"
59 #include "utilities/align.hpp"
60 #include "utilities/checkedCast.hpp"
61 #include "utilities/formatBuffer.hpp"
62 #include "vmreg_x86.inline.hpp"
63 #ifdef COMPILER1
64 #include "c1/c1_Runtime1.hpp"
65 #endif
66 #ifdef COMPILER2
67 #include "opto/runtime.hpp"
68 #endif
69 #if INCLUDE_JVMCI
70 #include "jvmci/jvmciJavaClasses.hpp"
71 #endif
72
73 #define __ masm->
74
75 #ifdef PRODUCT
76 #define BLOCK_COMMENT(str) /* nothing */
77 #else
78 #define BLOCK_COMMENT(str) __ block_comment(str)
79 #endif // PRODUCT
80
81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
82
83 class RegisterSaver {
84 // Capture info about frame layout. Layout offsets are in jint
85 // units because compiler frame slots are jints.
86 #define XSAVE_AREA_BEGIN 160
87 #define XSAVE_AREA_YMM_BEGIN 576
88 #define XSAVE_AREA_EGPRS 960
89 #define XSAVE_AREA_OPMASK_BEGIN 1088
90 #define XSAVE_AREA_ZMM_BEGIN 1152
91 #define XSAVE_AREA_UPPERBANK 1664
92 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
93 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
94 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
95 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
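// For example, DEF_XMM_OFFS(1) expands to
//   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
// so consecutive XMM save slots are 16 bytes (4 jint slots) apart, and the
// implicitly numbered xmmNH_off enumerator names the next jint slot, which
// the OopMap code below maps to the register's second 32-bit half.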
97 enum layout {
98 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
99 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
100 DEF_XMM_OFFS(0),
101 DEF_XMM_OFFS(1),
102 // 2..15 are implied in range usage
103 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
104 DEF_YMM_OFFS(0),
105 DEF_YMM_OFFS(1),
106 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
107 r16H_off,
108 r17_off, r17H_off,
109 r18_off, r18H_off,
110 r19_off, r19H_off,
111 r20_off, r20H_off,
112 r21_off, r21H_off,
113 r22_off, r22H_off,
114 r23_off, r23H_off,
115 r24_off, r24H_off,
116 r25_off, r25H_off,
117 r26_off, r26H_off,
118 r27_off, r27H_off,
119 r28_off, r28H_off,
120 r29_off, r29H_off,
121 r30_off, r30H_off,
122 r31_off, r31H_off,
123 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
124 DEF_OPMASK_OFFS(0),
125 DEF_OPMASK_OFFS(1),
126 // 2..7 are implied in range usage
127 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
128 DEF_ZMM_OFFS(0),
129 DEF_ZMM_OFFS(1),
130 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
131 DEF_ZMM_UPPER_OFFS(16),
132 DEF_ZMM_UPPER_OFFS(17),
133 // 18..31 are implied in range usage
134 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
135 fpu_stateH_end,
136 r15_off, r15H_off,
137 r14_off, r14H_off,
138 r13_off, r13H_off,
139 r12_off, r12H_off,
140 r11_off, r11H_off,
141 r10_off, r10H_off,
142 r9_off, r9H_off,
143 r8_off, r8H_off,
144 rdi_off, rdiH_off,
145 rsi_off, rsiH_off,
146 ignore_off, ignoreH_off, // extra copy of rbp
147 rsp_off, rspH_off,
148 rbx_off, rbxH_off,
149 rdx_off, rdxH_off,
150 rcx_off, rcxH_off,
151 rax_off, raxH_off,
152 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
153 align_off, alignH_off,
154 flags_off, flagsH_off,
155 // The frame sender code expects that rbp will be in the "natural" place and
156 // will override any oopMap setting for it. We must therefore force the layout
157 // so that it agrees with the frame sender code.
158 rbp_off, rbpH_off, // copy of rbp we will restore
159 return_off, returnH_off, // slot for return address
160 reg_save_size // size in compiler stack slots
161 };
162
163 public:
164 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
165 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
166
167 // Offsets into the register save area
168 // Used by deoptimization when it is managing result register
169 // values on its own
170
171 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
172 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
173 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
174 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; }
175 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
176 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
177
178 // During deoptimization only the result registers need to be restored,
179 // all the other values have already been extracted.
180 static void restore_result_registers(MacroAssembler* masm);
181 };
182
183 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
184 int off = 0;
185 int num_xmm_regs = XMMRegister::available_xmm_registers();
186 #if COMPILER2_OR_JVMCI
187 if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
189 }
190 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
191 #else
192 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
193 #endif
194
195 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
196 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
197 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
198 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
199 // CodeBlob frame size is in words.
200 int frame_size_in_words = frame_size_in_bytes / wordSize;
201 *total_frame_words = frame_size_in_words;
202
203 // Save registers, fpu state, and flags.
204 // We assume caller has already pushed the return address onto the
205 // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
208
209 __ enter(); // rsp becomes 16-byte aligned here
210 __ pushf();
211 // Make sure rsp stays 16-byte aligned
212 __ subq(rsp, 8);
213 // Push CPU state in multiple of 16 bytes
214 __ save_legacy_gprs();
215 __ push_FPU_state();
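  // At this point the frame matches the layout enum above, from high to low
  // addresses: return address, saved rbp (pushed by enter), flags, the 8-byte
  // alignment filler, the legacy GPRs, and finally the FXSAVE/XSAVE area.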
216
217
218 // push cpu state handles this on EVEX enabled targets
219 if (save_wide_vectors) {
220 // Save upper half of YMM registers(0..15)
221 int base_addr = XSAVE_AREA_YMM_BEGIN;
222 for (int n = 0; n < 16; n++) {
223 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
224 }
225 if (VM_Version::supports_evex()) {
226 // Save upper half of ZMM registers(0..15)
227 base_addr = XSAVE_AREA_ZMM_BEGIN;
228 for (int n = 0; n < 16; n++) {
229 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
230 }
231 // Save full ZMM registers(16..num_xmm_regs)
232 base_addr = XSAVE_AREA_UPPERBANK;
233 off = 0;
234 int vector_len = Assembler::AVX_512bit;
235 for (int n = 16; n < num_xmm_regs; n++) {
236 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
237 }
238 #if COMPILER2_OR_JVMCI
239 base_addr = XSAVE_AREA_OPMASK_BEGIN;
240 off = 0;
241 for(int n = 0; n < KRegister::number_of_registers; n++) {
242 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
243 }
244 #endif
245 }
246 } else {
247 if (VM_Version::supports_evex()) {
248 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
249 int base_addr = XSAVE_AREA_UPPERBANK;
250 off = 0;
251 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
252 for (int n = 16; n < num_xmm_regs; n++) {
253 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
254 }
255 #if COMPILER2_OR_JVMCI
256 base_addr = XSAVE_AREA_OPMASK_BEGIN;
257 off = 0;
258 for(int n = 0; n < KRegister::number_of_registers; n++) {
259 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
260 }
261 #endif
262 }
263 }
264
265 #if COMPILER2_OR_JVMCI
266 if (UseAPX) {
267 int base_addr = XSAVE_AREA_EGPRS;
268 off = 0;
269 for (int n = 16; n < Register::number_of_registers; n++) {
270 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
271 }
272 }
273 #endif
274
275 __ vzeroupper();
276 if (frame::arg_reg_save_area_bytes != 0) {
277 // Allocate argument register save area
278 __ subptr(rsp, frame::arg_reg_save_area_bytes);
279 }
280
281 // Set an oopmap for the call site. This oopmap will map all
282 // oop-registers and debug-info registers as callee-saved. This
283 // will allow deoptimization at this safepoint to find all possible
284 // debug-info recordings, as well as let GC find all oops.
285
286 OopMapSet *oop_maps = new OopMapSet();
287 OopMap* map = new OopMap(frame_size_in_slots, 0);
288
289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
290
291 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
292 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
293 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
294 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
295 // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
297 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
298 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
299 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
300 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
301 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
302 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
303 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
304 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
305 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
306 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
307
308 if (UseAPX) {
309 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
310 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
311 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
312 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
313 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
314 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
315 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
316 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
317 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
318 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
319 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
320 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
321 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
322 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
323 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
324 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
325 }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets we get it included in the xsave area
328 off = xmm0_off;
329 int delta = xmm1_off - off;
330 for (int n = 0; n < 16; n++) {
331 XMMRegister xmm_name = as_XMMRegister(n);
332 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
333 off += delta;
334 }
335 if (UseAVX > 2) {
336 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
337 off = zmm16_off;
338 delta = zmm17_off - off;
339 for (int n = 16; n < num_xmm_regs; n++) {
340 XMMRegister zmm_name = as_XMMRegister(n);
341 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
342 off += delta;
343 }
344 }
345
346 #if COMPILER2_OR_JVMCI
347 if (save_wide_vectors) {
348 // Save upper half of YMM registers(0..15)
349 off = ymm0_off;
350 delta = ymm1_off - ymm0_off;
351 for (int n = 0; n < 16; n++) {
352 XMMRegister ymm_name = as_XMMRegister(n);
353 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
354 off += delta;
355 }
356 if (VM_Version::supports_evex()) {
357 // Save upper half of ZMM registers(0..15)
358 off = zmm0_off;
359 delta = zmm1_off - zmm0_off;
360 for (int n = 0; n < 16; n++) {
361 XMMRegister zmm_name = as_XMMRegister(n);
362 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
363 off += delta;
364 }
365 }
366 }
367 #endif // COMPILER2_OR_JVMCI
368
369 // %%% These should all be a waste but we'll keep things as they were for now
370 if (true) {
371 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
372 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
373 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
374 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
375 // rbp location is known implicitly by the frame sender code, needs no oopmap
376 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
377 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
378 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
379 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
380 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
381 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
382 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
383 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
384 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
385 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
386 if (UseAPX) {
387 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
388 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
389 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
390 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
391 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
392 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
393 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
394 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
395 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
396 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
397 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
398 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
399 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
400 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
401 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
402 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
403 }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets we get it included in the xsave area
406 off = xmm0H_off;
407 delta = xmm1H_off - off;
408 for (int n = 0; n < 16; n++) {
409 XMMRegister xmm_name = as_XMMRegister(n);
410 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
411 off += delta;
412 }
413 if (UseAVX > 2) {
414 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
415 off = zmm16H_off;
416 delta = zmm17H_off - off;
417 for (int n = 16; n < num_xmm_regs; n++) {
418 XMMRegister zmm_name = as_XMMRegister(n);
419 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
420 off += delta;
421 }
422 }
423 }
424
425 return map;
426 }
427
428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
429 int num_xmm_regs = XMMRegister::available_xmm_registers();
430 if (frame::arg_reg_save_area_bytes != 0) {
431 // Pop arg register save area
432 __ addptr(rsp, frame::arg_reg_save_area_bytes);
433 }
434
435 #if COMPILER2_OR_JVMCI
436 if (restore_wide_vectors) {
437 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
438 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
439 }
440 #else
441 assert(!restore_wide_vectors, "vectors are generated only by C2");
442 #endif
443
444 __ vzeroupper();
445
446 // On EVEX enabled targets everything is handled in pop fpu state
447 if (restore_wide_vectors) {
448 // Restore upper half of YMM registers (0..15)
449 int base_addr = XSAVE_AREA_YMM_BEGIN;
450 for (int n = 0; n < 16; n++) {
451 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
452 }
453 if (VM_Version::supports_evex()) {
454 // Restore upper half of ZMM registers (0..15)
455 base_addr = XSAVE_AREA_ZMM_BEGIN;
456 for (int n = 0; n < 16; n++) {
457 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
458 }
459 // Restore full ZMM registers(16..num_xmm_regs)
460 base_addr = XSAVE_AREA_UPPERBANK;
461 int vector_len = Assembler::AVX_512bit;
462 int off = 0;
463 for (int n = 16; n < num_xmm_regs; n++) {
464 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
465 }
466 #if COMPILER2_OR_JVMCI
467 base_addr = XSAVE_AREA_OPMASK_BEGIN;
468 off = 0;
469 for (int n = 0; n < KRegister::number_of_registers; n++) {
470 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
471 }
472 #endif
473 }
474 } else {
475 if (VM_Version::supports_evex()) {
476 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
477 int base_addr = XSAVE_AREA_UPPERBANK;
478 int off = 0;
479 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
480 for (int n = 16; n < num_xmm_regs; n++) {
481 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
482 }
483 #if COMPILER2_OR_JVMCI
484 base_addr = XSAVE_AREA_OPMASK_BEGIN;
485 off = 0;
486 for (int n = 0; n < KRegister::number_of_registers; n++) {
487 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
488 }
489 #endif
490 }
491 }
492
493 #if COMPILER2_OR_JVMCI
494 if (UseAPX) {
495 int base_addr = XSAVE_AREA_EGPRS;
496 int off = 0;
497 for (int n = 16; n < Register::number_of_registers; n++) {
498 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
499 }
500 }
501 #endif
502
503 // Recover CPU state
504 __ pop_FPU_state();
505 __ restore_legacy_gprs();
506 __ addq(rsp, 8);
507 __ popf();
508 // Get the rbp described implicitly by the calling convention (no oopMap)
509 __ pop(rbp);
510 }
511
512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
513
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-saved register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
519
520 // Restore fp result register
521 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
522 // Restore integer result register
523 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
524 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
525
  // Pop all of the register save area off the stack except the return address
527 __ addptr(rsp, return_offset_in_bytes());
528 }
529
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
532 bool SharedRuntime::is_wide_vector(int size) {
533 return size > 16;
534 }
535
536 // ---------------------------------------------------------------------------
537 // Read the array of BasicTypes from a signature, and compute where the
538 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
539 // quantities. Values less than VMRegImpl::stack0 are registers, those above
540 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
541 // as framesizes are fixed.
542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
545 // integer registers.
546
547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
548 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
549 // units regardless of build. Of course for i486 there is no 64 bit build
550
551 // The Java calling convention is a "shifted" version of the C ABI.
552 // By skipping the first C ABI register we can call non-static jni methods
553 // with small numbers of arguments without having to shuffle the arguments
554 // at all. Since we control the java ABI we ought to at least get some
555 // advantage out of it.
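// As a worked example of the mapping below: for a signature of
// (Object, long, int, double), the Object lands in j_rarg0, the long in
// j_rarg1, the int in j_rarg2 and the double in j_farg0, while the T_VOID
// halves that follow the long and the double get no location (set_bad).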
556
557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
558 VMRegPair *regs,
559 int total_args_passed) {
560
561 // Create the mapping between argument positions and
562 // registers.
563 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
564 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
565 };
566 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
567 j_farg0, j_farg1, j_farg2, j_farg3,
568 j_farg4, j_farg5, j_farg6, j_farg7
569 };
570
571
572 uint int_args = 0;
573 uint fp_args = 0;
574 uint stk_args = 0;
575
576 for (int i = 0; i < total_args_passed; i++) {
577 switch (sig_bt[i]) {
578 case T_BOOLEAN:
579 case T_CHAR:
580 case T_BYTE:
581 case T_SHORT:
582 case T_INT:
583 if (int_args < Argument::n_int_register_parameters_j) {
584 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
585 } else {
586 stk_args = align_up(stk_args, 2);
587 regs[i].set1(VMRegImpl::stack2reg(stk_args));
588 stk_args += 1;
589 }
590 break;
591 case T_VOID:
592 // halves of T_LONG or T_DOUBLE
593 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
594 regs[i].set_bad();
595 break;
596 case T_LONG:
597 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
598 // fall through
599 case T_OBJECT:
600 case T_ARRAY:
601 case T_ADDRESS:
602 if (int_args < Argument::n_int_register_parameters_j) {
603 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
604 } else {
605 stk_args = align_up(stk_args, 2);
606 regs[i].set2(VMRegImpl::stack2reg(stk_args));
607 stk_args += 2;
608 }
609 break;
610 case T_FLOAT:
611 if (fp_args < Argument::n_float_register_parameters_j) {
612 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
613 } else {
614 stk_args = align_up(stk_args, 2);
615 regs[i].set1(VMRegImpl::stack2reg(stk_args));
616 stk_args += 1;
617 }
618 break;
619 case T_DOUBLE:
620 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
621 if (fp_args < Argument::n_float_register_parameters_j) {
622 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
623 } else {
624 stk_args = align_up(stk_args, 2);
625 regs[i].set2(VMRegImpl::stack2reg(stk_args));
626 stk_args += 2;
627 }
628 break;
629 default:
630 ShouldNotReachHere();
631 break;
632 }
633 }
634
635 return stk_args;
636 }
637
638 // Same as java_calling_convention() but for multiple return
639 // values. There's no way to store them on the stack so if we don't
640 // have enough registers, multiple values can't be returned.
641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
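// For example, following INT_ArgReg below, returning a (long, int) pair would
// place the long in rax and the int in j_rarg5 and return 2; if the values do
// not all fit in registers the function returns -1.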
643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
644 VMRegPair *regs,
645 int total_args_passed) {
646 // Create the mapping between argument positions and
647 // registers.
648 static const Register INT_ArgReg[java_return_convention_max_int] = {
649 rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
650 };
651 static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
652 j_farg0, j_farg1, j_farg2, j_farg3,
653 j_farg4, j_farg5, j_farg6, j_farg7
654 };
655
656
657 uint int_args = 0;
658 uint fp_args = 0;
659
660 for (int i = 0; i < total_args_passed; i++) {
661 switch (sig_bt[i]) {
662 case T_BOOLEAN:
663 case T_CHAR:
664 case T_BYTE:
665 case T_SHORT:
666 case T_INT:
667 if (int_args < Argument::n_int_register_parameters_j+1) {
668 regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
669 int_args++;
670 } else {
671 return -1;
672 }
673 break;
674 case T_VOID:
675 // halves of T_LONG or T_DOUBLE
676 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
677 regs[i].set_bad();
678 break;
679 case T_LONG:
680 assert(sig_bt[i + 1] == T_VOID, "expecting half");
681 // fall through
682 case T_OBJECT:
683 case T_ARRAY:
684 case T_ADDRESS:
685 case T_METADATA:
686 if (int_args < Argument::n_int_register_parameters_j+1) {
687 regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
688 int_args++;
689 } else {
690 return -1;
691 }
692 break;
693 case T_FLOAT:
694 if (fp_args < Argument::n_float_register_parameters_j) {
695 regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
696 fp_args++;
697 } else {
698 return -1;
699 }
700 break;
701 case T_DOUBLE:
702 assert(sig_bt[i + 1] == T_VOID, "expecting half");
703 if (fp_args < Argument::n_float_register_parameters_j) {
704 regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
705 fp_args++;
706 } else {
707 return -1;
708 }
709 break;
710 default:
711 ShouldNotReachHere();
712 break;
713 }
714 }
715
716 return int_args + fp_args;
717 }
718
// Patch the caller's callsite with the entry to compiled code if it exists.
720 static void patch_callers_callsite(MacroAssembler *masm) {
721 Label L;
722 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
723 __ jcc(Assembler::equal, L);
724
725 // Save the current stack pointer
726 __ mov(r13, rsp);
727 // Schedule the branch target address early.
728 // Call into the VM to patch the caller, then jump to compiled callee
729 // rax isn't live so capture return address while we easily can
730 __ movptr(rax, Address(rsp, 0));
731
732 // align stack so push_CPU_state doesn't fault
733 __ andptr(rsp, -(StackAlignmentInBytes));
734 __ push_CPU_state();
735 __ vzeroupper();
736 // VM needs caller's callsite
737 // VM needs target method
738 // This needs to be a long call since we will relocate this adapter to
739 // the codeBuffer and it may not reach
740
741 // Allocate argument register save area
742 if (frame::arg_reg_save_area_bytes != 0) {
743 __ subptr(rsp, frame::arg_reg_save_area_bytes);
744 }
745 __ mov(c_rarg0, rbx);
746 __ mov(c_rarg1, rax);
747 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
748
749 // De-allocate argument register save area
750 if (frame::arg_reg_save_area_bytes != 0) {
751 __ addptr(rsp, frame::arg_reg_save_area_bytes);
752 }
753
754 __ vzeroupper();
755 __ pop_CPU_state();
756 // restore sp
757 __ mov(rsp, r13);
758 __ bind(L);
759 }
760
761 // For each inline type argument, sig includes the list of fields of
762 // the inline type. This utility function computes the number of
763 // arguments for the call if inline types are passed by reference (the
764 // calling convention the interpreter expects).
765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
766 int total_args_passed = 0;
767 if (InlineTypePassFieldsAsArgs) {
768 for (int i = 0; i < sig_extended->length(); i++) {
769 BasicType bt = sig_extended->at(i)._bt;
770 if (bt == T_METADATA) {
771 // In sig_extended, an inline type argument starts with:
772 // T_METADATA, followed by the types of the fields of the
773 // inline type and T_VOID to mark the end of the value
774 // type. Inline types are flattened so, for instance, in the
775 // case of an inline type with an int field and an inline type
776 // field that itself has 2 fields, an int and a long:
777 // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
778 // slot for the T_LONG) T_VOID (inner inline type) T_VOID
779 // (outer inline type)
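        // Note that the whole flattened sequence above still counts as a single
        // interpreter argument: total_args_passed is incremented once here and
        // the do/while loop below just skips over the field entries.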
780 total_args_passed++;
781 int vt = 1;
782 do {
783 i++;
784 BasicType bt = sig_extended->at(i)._bt;
785 BasicType prev_bt = sig_extended->at(i-1)._bt;
786 if (bt == T_METADATA) {
787 vt++;
788 } else if (bt == T_VOID &&
789 prev_bt != T_LONG &&
790 prev_bt != T_DOUBLE) {
791 vt--;
792 }
793 } while (vt != 0);
794 } else {
795 total_args_passed++;
796 }
797 }
798 } else {
799 total_args_passed = sig_extended->length();
800 }
801 return total_args_passed;
802 }
803
804
805 static void gen_c2i_adapter_helper(MacroAssembler* masm,
806 BasicType bt,
807 BasicType prev_bt,
808 size_t size_in_bytes,
809 const VMRegPair& reg_pair,
810 const Address& to,
811 int extraspace,
812 bool is_oop) {
813 if (bt == T_VOID) {
814 assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
815 return;
816 }
817
818 // Say 4 args:
819 // i st_off
820 // 0 32 T_LONG
821 // 1 24 T_VOID
822 // 2 16 T_OBJECT
823 // 3 8 T_BOOL
824 // - 0 return address
825 //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
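  // In the example above the T_LONG value for i == 0 is therefore written at
  // st_off 24 (the slot listed for its T_VOID half), while slot 32 is left
  // unused (the caller fills it with known junk under ASSERT).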
830
831 bool wide = (size_in_bytes == wordSize);
832 VMReg r_1 = reg_pair.first();
833 VMReg r_2 = reg_pair.second();
834 assert(r_2->is_valid() == wide, "invalid size");
835 if (!r_1->is_valid()) {
836 assert(!r_2->is_valid(), "must be invalid");
837 return;
838 }
839
840 if (!r_1->is_XMMRegister()) {
841 Register val = rax;
842 if (r_1->is_stack()) {
843 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
844 __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
845 } else {
846 val = r_1->as_Register();
847 }
848 assert_different_registers(to.base(), val, rscratch1);
849 if (is_oop) {
850 __ push(r13);
851 __ push(rbx);
852 // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
853 __ push(to.base());
854 __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
855 __ pop(to.base());
856 __ pop(rbx);
857 __ pop(r13);
858 } else {
859 __ store_sized_value(to, val, size_in_bytes);
860 }
861 } else {
862 if (wide) {
863 __ movdbl(to, r_1->as_XMMRegister());
864 } else {
865 __ movflt(to, r_1->as_XMMRegister());
866 }
867 }
868 }
869
870 static void gen_c2i_adapter(MacroAssembler *masm,
871 const GrowableArray<SigEntry>* sig_extended,
872 const VMRegPair *regs,
873 bool requires_clinit_barrier,
874 address& c2i_no_clinit_check_entry,
875 Label& skip_fixup,
876 address start,
877 OopMapSet* oop_maps,
878 int& frame_complete,
879 int& frame_size_in_words,
880 bool alloc_inline_receiver) {
881 if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
882 Label L_skip_barrier;
883 Register method = rbx;
884
885 { // Bypass the barrier for non-static methods
886 Register flags = rscratch1;
887 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
888 __ testl(flags, JVM_ACC_STATIC);
889 __ jcc(Assembler::zero, L_skip_barrier); // non-static
890 }
891
892 Register klass = rscratch1;
893 __ load_method_holder(klass, method);
894 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
895
896 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
897
898 __ bind(L_skip_barrier);
899 c2i_no_clinit_check_entry = __ pc();
900 }
901
902 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
903 bs->c2i_entry_barrier(masm);
904
905 // Before we get into the guts of the C2I adapter, see if we should be here
906 // at all. We've come from compiled code and are attempting to jump to the
907 // interpreter, which means the caller made a static call to get here
908 // (vcalls always get a compiled target if there is one). Check for a
909 // compiled target. If there is one, we need to patch the caller's call.
910 patch_callers_callsite(masm);
911
912 __ bind(skip_fixup);
913
914 if (InlineTypePassFieldsAsArgs) {
915 // Is there an inline type argument?
916 bool has_inline_argument = false;
917 for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
918 has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
919 }
920 if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
922 // compiled code so we have no buffers to back the inline types.
923 // Allocate the buffers here with a runtime call.
924 OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
925
926 frame_complete = __ offset();
927
928 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
929
930 __ mov(c_rarg0, r15_thread);
931 __ mov(c_rarg1, rbx);
932 __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
933 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
934
935 oop_maps->add_gc_map((int)(__ pc() - start), map);
936 __ reset_last_Java_frame(false);
937
938 RegisterSaver::restore_live_registers(masm);
939
940 Label no_exception;
941 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
942 __ jcc(Assembler::equal, no_exception);
943
944 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
945 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
946 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
947
948 __ bind(no_exception);
949
950 // We get an array of objects from the runtime call
951 __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
952 __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
953 }
954 }
955
956 // Since all args are passed on the stack, total_args_passed *
957 // Interpreter::stackElementSize is the space we need.
958 int total_args_passed = compute_total_args_passed_int(sig_extended);
959 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
960
961 int extraspace = (total_args_passed * Interpreter::stackElementSize);
962
963 // stack is aligned, keep it that way
964 // This is not currently needed or enforced by the interpreter, but
965 // we might as well conform to the ABI.
966 extraspace = align_up(extraspace, 2*wordSize);
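  // For example, 3 interpreter arguments need 3 * Interpreter::stackElementSize
  // bytes, rounded up here to the next 16-byte boundary.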
967
968 // set senderSP value
969 __ lea(r13, Address(rsp, wordSize));
970
971 #ifdef ASSERT
972 __ check_stack_alignment(r13, "sender stack not aligned");
973 #endif
974 if (extraspace > 0) {
975 // Pop the return address
976 __ pop(rax);
977
978 __ subptr(rsp, extraspace);
979
980 // Push the return address
981 __ push(rax);
982
983 // Account for the return address location since we store it first rather
984 // than hold it in a register across all the shuffling
985 extraspace += wordSize;
986 }
987
988 #ifdef ASSERT
989 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
990 #endif
991
992 // Now write the args into the outgoing interpreter space
993
994 // next_arg_comp is the next argument from the compiler point of
995 // view (inline type fields are passed in registers/on the stack). In
996 // sig_extended, an inline type argument starts with: T_METADATA,
997 // followed by the types of the fields of the inline type and T_VOID
998 // to mark the end of the inline type. ignored counts the number of
999 // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
1000 // used to get the buffer for that argument from the pool of buffers
1001 // we allocated above and want to pass to the
1002 // interpreter. next_arg_int is the next argument from the
1003 // interpreter point of view (inline types are passed by reference).
1004 for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1005 next_arg_comp < sig_extended->length(); next_arg_comp++) {
1006 assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1007 assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1008 BasicType bt = sig_extended->at(next_arg_comp)._bt;
1009 int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1010 if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1011 int next_off = st_off - Interpreter::stackElementSize;
1012 const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1013 const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1014 size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1015 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1016 size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1017 next_arg_int++;
1018 #ifdef ASSERT
1019 if (bt == T_LONG || bt == T_DOUBLE) {
1020 // Overwrite the unused slot with known junk
1021 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1022 __ movptr(Address(rsp, st_off), rax);
1023 }
1024 #endif /* ASSERT */
1025 } else {
1026 ignored++;
1027 // get the buffer from the just allocated pool of buffers
1028 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1029 __ load_heap_oop(r14, Address(rscratch2, index));
1030 next_vt_arg++; next_arg_int++;
1031 int vt = 1;
1032 // write fields we get from compiled code in registers/stack
1033 // slots to the buffer: we know we are done with that inline type
1034 // argument when we hit the T_VOID that acts as an end of inline
1035 // type delimiter for this inline type. Inline types are flattened
1036 // so we might encounter embedded inline types. Each entry in
1037 // sig_extended contains a field offset in the buffer.
1038 Label L_null;
1039 do {
1040 next_arg_comp++;
1041 BasicType bt = sig_extended->at(next_arg_comp)._bt;
1042 BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1043 if (bt == T_METADATA) {
1044 vt++;
1045 ignored++;
1046 } else if (bt == T_VOID &&
1047 prev_bt != T_LONG &&
1048 prev_bt != T_DOUBLE) {
1049 vt--;
1050 ignored++;
1051 } else {
1052 int off = sig_extended->at(next_arg_comp)._offset;
1053 if (off == -1) {
1054 // Nullable inline type argument, emit null check
1055 VMReg reg = regs[next_arg_comp-ignored].first();
1056 Label L_notNull;
1057 if (reg->is_stack()) {
1058 int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1059 __ testb(Address(rsp, ld_off), 1);
1060 } else {
1061 __ testb(reg->as_Register(), 1);
1062 }
1063 __ jcc(Assembler::notZero, L_notNull);
1064 __ movptr(Address(rsp, st_off), 0);
1065 __ jmp(L_null);
1066 __ bind(L_notNull);
1067 continue;
1068 }
1069 assert(off > 0, "offset in object should be positive");
1070 size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1071 bool is_oop = is_reference_type(bt);
1072 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1073 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1074 }
1075 } while (vt != 0);
1076 // pass the buffer to the interpreter
1077 __ movptr(Address(rsp, st_off), r14);
1078 __ bind(L_null);
1079 }
1080 }
1081
1082 // Schedule the branch target address early.
1083 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1084 __ jmp(rcx);
1085 }
1086
1087 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1088 int comp_args_on_stack,
1089 const GrowableArray<SigEntry>* sig,
1090 const VMRegPair *regs) {
1091
1092 // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because we
  // must align the stack to 16 bytes on an i2c entry; otherwise we lose
  // the alignment we expect in all compiled code, and the register save
  // code can segv when fxsave instructions find an improperly aligned
  // stack pointer.
1100
1101 // Adapters can be frameless because they do not require the caller
1102 // to perform additional cleanup work, such as correcting the stack pointer.
1103 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1104 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1105 // even if a callee has modified the stack pointer.
1106 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1107 // routinely repairs its caller's stack pointer (from sender_sp, which is set
1108 // up via the senderSP register).
1109 // In other words, if *either* the caller or callee is interpreted, we can
1110 // get the stack pointer repaired after a call.
1111 // This is why c2i and i2c adapters cannot be indefinitely composed.
1112 // In particular, if a c2i adapter were to somehow call an i2c adapter,
1113 // both caller and callee would be compiled methods, and neither would
1114 // clean up the stack pointer changes performed by the two adapters.
1115 // If this happens, control eventually transfers back to the compiled
1116 // caller, but with an uncorrected stack, causing delayed havoc.
1117
1118 // Must preserve original SP for loading incoming arguments because
1119 // we need to align the outgoing SP for compiled code.
1120 __ movptr(r11, rsp);
1121
1122 // Pick up the return address
1123 __ pop(rax);
1124
1125 // Convert 4-byte c2 stack slots to words.
1126 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
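  // E.g. 5 outgoing 4-byte slots: align_up(20, wordSize) >> LogBytesPerWord == 3 words.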
1127
1128 if (comp_args_on_stack) {
1129 __ subptr(rsp, comp_words_on_stack * wordSize);
1130 }
1131
1132 // Ensure compiled code always sees stack at proper alignment
1133 __ andptr(rsp, -16);
1134
  // Push the return address and misalign the stack so that the youngest frame
  // sees the stack exactly as it would right after a call instruction.
1137 __ push(rax);
1138
1139 // Put saved SP in another register
1140 const Register saved_sp = rax;
1141 __ movptr(saved_sp, r11);
1142
1143 // Will jump to the compiled code just as if compiled code was doing it.
1144 // Pre-load the register-jump target early, to schedule it better.
1145 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1146
1147 #if INCLUDE_JVMCI
1148 if (EnableJVMCI) {
1149 // check if this call should be routed towards a specific entry point
1150 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1151 Label no_alternative_target;
1152 __ jcc(Assembler::equal, no_alternative_target);
1153 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1154 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1155 __ bind(no_alternative_target);
1156 }
1157 #endif // INCLUDE_JVMCI
1158
1159 int total_args_passed = sig->length();
1160
1161 // Now generate the shuffle code. Pick up all register args and move the
1162 // rest through the floating point stack top.
1163 for (int i = 0; i < total_args_passed; i++) {
1164 BasicType bt = sig->at(i)._bt;
1165 if (bt == T_VOID) {
1166 // Longs and doubles are passed in native word order, but misaligned
1167 // in the 32-bit build.
1168 BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1169 assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1170 continue;
1171 }
1172
1173 // Pick up 0, 1 or 2 words from SP+offset.
1174
1175 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1176 "scrambled load targets?");
1177 // Load in argument order going down.
1178 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1179 // Point to interpreter value (vs. tag)
1180 int next_off = ld_off - Interpreter::stackElementSize;
1181 //
1182 //
1183 //
1184 VMReg r_1 = regs[i].first();
1185 VMReg r_2 = regs[i].second();
1186 if (!r_1->is_valid()) {
1187 assert(!r_2->is_valid(), "");
1188 continue;
1189 }
1190 if (r_1->is_stack()) {
1191 // Convert stack slot to an SP offset (+ wordSize to account for return address )
1192 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1193
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
1197 if (!r_2->is_valid()) {
1198 // sign extend???
1199 __ movl(r13, Address(saved_sp, ld_off));
1200 __ movptr(Address(rsp, st_off), r13);
1201 } else {
1202 //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1206 //
1207 // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1208 // are accessed as negative so LSW is at LOW address
1209
1210 // ld_off is MSW so get LSW
1211 const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1212 next_off : ld_off;
1213 __ movq(r13, Address(saved_sp, offset));
1214 // st_off is LSW (i.e. reg.first())
1215 __ movq(Address(rsp, st_off), r13);
1216 }
1217 } else if (r_1->is_Register()) { // Register argument
1218 Register r = r_1->as_Register();
1219 assert(r != rax, "must be different");
1220 if (r_2->is_valid()) {
1221 //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1225
1226 const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1227 next_off : ld_off;
1228
1229 // this can be a misaligned move
1230 __ movq(r, Address(saved_sp, offset));
1231 } else {
1232 // sign extend and use a full word?
1233 __ movl(r, Address(saved_sp, ld_off));
1234 }
1235 } else {
1236 if (!r_2->is_valid()) {
1237 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1238 } else {
1239 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1240 }
1241 }
1242 }
1243
1244 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1245
1246 // 6243940 We might end up in handle_wrong_method if
1247 // the callee is deoptimized as we race thru here. If that
1248 // happens we don't want to take a safepoint because the
1249 // caller frame will look interpreted and arguments are now
1250 // "compiled" so it is much better to make this transition
1251 // invisible to the stack walking code. Unfortunately if
1252 // we try and find the callee by normal means a safepoint
1253 // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
1255
1256 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1257
  // Put the Method* where a c2i would expect it should we end up there.
  // This is only needed because c2 resolve stubs return the Method* as a
  // result in rax.
1261 __ mov(rax, rbx);
1262 __ jmp(r11);
1263 }
1264
1265 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1266 Register data = rax;
1267 __ ic_check(1 /* end_alignment */);
1268 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1269
1270 // Method might have been compiled since the call site was patched to
1271 // interpreted if that is the case treat it as a miss so we can get
1272 // the call site corrected.
1273 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1274 __ jcc(Assembler::equal, skip_fixup);
1275 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1276 }
1277
1278 // ---------------------------------------------------------------
1279 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1280 int comp_args_on_stack,
1281 const GrowableArray<SigEntry>* sig,
1282 const VMRegPair* regs,
1283 const GrowableArray<SigEntry>* sig_cc,
1284 const VMRegPair* regs_cc,
1285 const GrowableArray<SigEntry>* sig_cc_ro,
1286 const VMRegPair* regs_cc_ro,
1287 address entry_address[AdapterBlob::ENTRY_COUNT],
1288 AdapterBlob*& new_adapter,
1289 bool allocate_code_blob) {
1290 entry_address[AdapterBlob::I2C] = __ pc();
1291 gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1292
1293 // -------------------------------------------------------------------------
1294 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
1295 // to the interpreter. The args start out packed in the compiled layout. They
1296 // need to be unpacked into the interpreter layout. This will almost always
1297 // require some stack space. We grow the current (compiled) stack, then repack
1298 // the args. We finally end in a jump to the generic interpreter entry point.
1299 // On exit from the interpreter, the interpreter will restore our SP (lest the
1300 // compiled code, which relies solely on SP and not RBP, get sick).
1301
1302 entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1303 entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1304 Label skip_fixup;
1305
1306 gen_inline_cache_check(masm, skip_fixup);
1307
1308 OopMapSet* oop_maps = new OopMapSet();
1309 int frame_complete = CodeOffsets::frame_never_safe;
1310 int frame_size_in_words = 0;
1311
1312 // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1313 entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1314 entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
1315 if (regs_cc != regs_cc_ro) {
1316 // No class init barrier needed because method is guaranteed to be non-static
1317 gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1318 skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1319 skip_fixup.reset();
1320 }
1321
1322 // Scalarized c2i adapter
1323 entry_address[AdapterBlob::C2I] = __ pc();
1324 entry_address[AdapterBlob::C2I_Inline] = __ pc();
1325 gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1326 skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1327
1328 // Non-scalarized c2i adapter
1329 if (regs != regs_cc) {
1330 entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1331 Label inline_entry_skip_fixup;
1332 gen_inline_cache_check(masm, inline_entry_skip_fixup);
1333
1334 entry_address[AdapterBlob::C2I_Inline] = __ pc();
1335 gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1336 inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1337 }
1338
1339 // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1340 // the GC knows about the location of oop argument locations passed to the c2i adapter.
1341 if (allocate_code_blob) {
1342 bool caller_must_gc_arguments = (regs != regs_cc);
1343 int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
1344 assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
1345 AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
1346 new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1347 }
1348 }
1349
1350 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1351 VMRegPair *regs,
1352 int total_args_passed) {
1353
1354 // We return the amount of VMRegImpl stack slots we need to reserve for all
1355 // the arguments NOT counting out_preserve_stack_slots.
1356
1357 // NOTE: These arrays will have to change when c1 is ported
1358 #ifdef _WIN64
1359 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1360 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1361 };
1362 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1363 c_farg0, c_farg1, c_farg2, c_farg3
1364 };
1365 #else
1366 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1367 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1368 };
1369 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1370 c_farg0, c_farg1, c_farg2, c_farg3,
1371 c_farg4, c_farg5, c_farg6, c_farg7
1372 };
1373 #endif // _WIN64
1374
1375
1376 uint int_args = 0;
1377 uint fp_args = 0;
1378 uint stk_args = 0; // inc by 2 each time
1379
1380 for (int i = 0; i < total_args_passed; i++) {
1381 switch (sig_bt[i]) {
1382 case T_BOOLEAN:
1383 case T_CHAR:
1384 case T_BYTE:
1385 case T_SHORT:
1386 case T_INT:
1387 if (int_args < Argument::n_int_register_parameters_c) {
1388 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1389 #ifdef _WIN64
1390 fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1392 stk_args += 2;
1393 #endif
1394 } else {
1395 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1396 stk_args += 2;
1397 }
1398 break;
1399 case T_LONG:
1400 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1401 // fall through
1402 case T_OBJECT:
1403 case T_ARRAY:
1404 case T_ADDRESS:
1405 case T_METADATA:
1406 if (int_args < Argument::n_int_register_parameters_c) {
1407 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1408 #ifdef _WIN64
1409 fp_args++;
1410 stk_args += 2;
1411 #endif
1412 } else {
1413 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1414 stk_args += 2;
1415 }
1416 break;
1417 case T_FLOAT:
1418 if (fp_args < Argument::n_float_register_parameters_c) {
1419 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1420 #ifdef _WIN64
1421 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1423 stk_args += 2;
1424 #endif
1425 } else {
1426 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1427 stk_args += 2;
1428 }
1429 break;
1430 case T_DOUBLE:
1431 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1432 if (fp_args < Argument::n_float_register_parameters_c) {
1433 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1434 #ifdef _WIN64
1435 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1437 stk_args += 2;
1438 #endif
1439 } else {
1440 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1441 stk_args += 2;
1442 }
1443 break;
1444 case T_VOID: // Halves of longs and doubles
1445 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1446 regs[i].set_bad();
1447 break;
1448 default:
1449 ShouldNotReachHere();
1450 break;
1451 }
1452 }
1453 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers (the home/shadow area) to be stored down.
1456 if (stk_args < 8) {
1457 stk_args = 8;
1458 }
1459 #endif // _WIN64
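  // Worked example (illustrative): for a native signature of (jint, jlong, jfloat, jdouble)
  // the loop above assigns c_rarg0/c_rarg1 and xmm0/xmm1 on Linux and leaves stk_args == 0
  // (the T_VOID halves following the long and the double are simply marked bad). On Windows
  // the same arguments land in rcx, rdx, xmm2 and xmm3, since argument slots are positional
  // across the integer and FP banks, and stk_args ends up as 8 because every register
  // argument also reserves a 64-bit home slot.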
1460
1461 return stk_args;
1462 }
1463
1464 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1465 uint num_bits,
1466 uint total_args_passed) {
1467 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1468 "only certain vector sizes are supported for now");
1469
1470 static const XMMRegister VEC_ArgReg[32] = {
1471 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1472 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1473 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1474 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1475 };
1476
1477 uint stk_args = 0;
1478 uint fp_args = 0;
1479
1480 for (uint i = 0; i < total_args_passed; i++) {
1481 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
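    // Each VMRegImpl slot covers 32 bits, so a num_bits-wide vector spans num_bits/32
    // slots; next_val == num_bits/32 - 1 selects the last slot of that span for the
    // second half of the pair set below.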
1482 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1483 regs[i].set_pair(vmreg->next(next_val), vmreg);
1484 }
1485
1486 return stk_args;
1487 }
1488
1489 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1492 switch (ret_type) {
1493 case T_FLOAT:
1494 __ movflt(Address(rbp, -wordSize), xmm0);
1495 break;
1496 case T_DOUBLE:
1497 __ movdbl(Address(rbp, -wordSize), xmm0);
1498 break;
1499 case T_VOID: break;
1500 default: {
1501 __ movptr(Address(rbp, -wordSize), rax);
1502 }
1503 }
1504 }
1505
1506 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1509 switch (ret_type) {
1510 case T_FLOAT:
1511 __ movflt(xmm0, Address(rbp, -wordSize));
1512 break;
1513 case T_DOUBLE:
1514 __ movdbl(xmm0, Address(rbp, -wordSize));
1515 break;
1516 case T_VOID: break;
1517 default: {
1518 __ movptr(rax, Address(rbp, -wordSize));
1519 }
1520 }
1521 }
1522
1523 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1524 for ( int i = first_arg ; i < arg_count ; i++ ) {
1525 if (args[i].first()->is_Register()) {
1526 __ push(args[i].first()->as_Register());
1527 } else if (args[i].first()->is_XMMRegister()) {
1528 __ subptr(rsp, 2*wordSize);
1529 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1530 }
1531 }
1532 }
1533
1534 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1535 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1536 if (args[i].first()->is_Register()) {
1537 __ pop(args[i].first()->as_Register());
1538 } else if (args[i].first()->is_XMMRegister()) {
1539 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1540 __ addptr(rsp, 2*wordSize);
1541 }
1542 }
1543 }
1544
1545 static void verify_oop_args(MacroAssembler* masm,
1546 const methodHandle& method,
1547 const BasicType* sig_bt,
1548 const VMRegPair* regs) {
1549 Register temp_reg = rbx; // not part of any compiled calling seq
1550 if (VerifyOops) {
1551 for (int i = 0; i < method->size_of_parameters(); i++) {
1552 if (is_reference_type(sig_bt[i])) {
1553 VMReg r = regs[i].first();
1554 assert(r->is_valid(), "bad oop arg");
1555 if (r->is_stack()) {
1556 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1557 __ verify_oop(temp_reg);
1558 } else {
1559 __ verify_oop(r->as_Register());
1560 }
1561 }
1562 }
1563 }
1564 }
1565
1566 static void check_continuation_enter_argument(VMReg actual_vmreg,
1567 Register expected_reg,
1568 const char* name) {
1569 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1570 assert(actual_vmreg->as_Register() == expected_reg,
1571 "%s is in unexpected register: %s instead of %s",
1572 name, actual_vmreg->as_Register()->name(), expected_reg->name());
1573 }
1574
1575
1576 //---------------------------- continuation_enter_setup ---------------------------
1577 //
1578 // Arguments:
1579 // None.
1580 //
1581 // Results:
1582 // rsp: pointer to blank ContinuationEntry
1583 //
1584 // Kills:
1585 // rax
1586 //
1587 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1588 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1589 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
1590 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1591
1592 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1593 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1594
1595 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1596 OopMap* map = new OopMap(frame_size, 0);
1597
1598 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1599 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1600 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
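  // rsp now points at the new (still blank) ContinuationEntry: its parent field holds
  // the previous entry and the thread's cont_entry refers to this one.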
1601
1602 return map;
1603 }
1604
1605 //---------------------------- fill_continuation_entry ---------------------------
1606 //
1607 // Arguments:
1608 // rsp: pointer to blank Continuation entry
1609 // reg_cont_obj: pointer to the continuation
1610 // reg_flags: flags
1611 //
1612 // Results:
1613 // rsp: pointer to filled out ContinuationEntry
1614 //
1615 // Kills:
1616 // rax
1617 //
1618 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1619 assert_different_registers(rax, reg_cont_obj, reg_flags);
1620 #ifdef ASSERT
1621 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1622 #endif
1623 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1624 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1625 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1626 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1627 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1628
1629 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1630 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1631
1632 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1633 }
1634
1635 //---------------------------- continuation_enter_cleanup ---------------------------
1636 //
1637 // Arguments:
1638 // rsp: pointer to the ContinuationEntry
1639 //
1640 // Results:
1641 // rsp: pointer to the spilled rbp in the entry frame
1642 //
1643 // Kills:
1644 // rbx
1645 //
1646 static void continuation_enter_cleanup(MacroAssembler* masm) {
1647 #ifdef ASSERT
1648 Label L_good_sp;
1649 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1650 __ jcc(Assembler::equal, L_good_sp);
1651 __ stop("Incorrect rsp at continuation_enter_cleanup");
1652 __ bind(L_good_sp);
1653 #endif
1654 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1655 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1656 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1657 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1658 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1659 }
1660
1661 static void gen_continuation_enter(MacroAssembler* masm,
1662 const VMRegPair* regs,
1663 int& exception_offset,
1664 OopMapSet* oop_maps,
1665 int& frame_complete,
1666 int& stack_slots,
1667 int& interpreted_entry_offset,
1668 int& compiled_entry_offset) {
1669
1670 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1671 int pos_cont_obj = 0;
1672 int pos_is_cont = 1;
1673 int pos_is_virtual = 2;
1674
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1679 Register reg_cont_obj = c_rarg1;
1680 Register reg_is_cont = c_rarg2;
1681 Register reg_is_virtual = c_rarg3;
1682
1683 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1684 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1685 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1686
1687 // Utility methods kill rax, make sure there are no collisions
1688 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1689
1690 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1691 relocInfo::static_call_type);
1692
1693 address start = __ pc();
1694
1695 Label L_thaw, L_exit;
1696
1697 // i2i entry used at interp_only_mode only
1698 interpreted_entry_offset = __ pc() - start;
1699 {
1700 #ifdef ASSERT
1701 Label is_interp_only;
1702 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1703 __ jcc(Assembler::notEqual, is_interp_only);
1704 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1705 __ bind(is_interp_only);
1706 #endif
1707
1708 __ pop(rax); // return address
1709 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
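    // With the return address popped, the interpreter arguments sit on the expression
    // stack with the first one deepest: the Continuation oop at 2 * stackElementSize,
    // isContinue at 1 * stackElementSize and isVirtualThread at offset 0.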
1710 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1711 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1712 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1713 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1714 __ push(rax); // return address
1715 __ push_cont_fastpath();
1716
1717 __ enter();
1718
1719 stack_slots = 2; // will be adjusted in setup
1720 OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That is okay: at the very worst we miss an async sample, and we
    // are in interp_only_mode anyway.
1723
1724 __ verify_oop(reg_cont_obj);
1725
1726 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1727
1728 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1729 __ testptr(reg_is_cont, reg_is_cont);
1730 __ jcc(Assembler::notZero, L_thaw);
1731
1732 // --- Resolve path
1733
1734 // Make sure the call is patchable
1735 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1736 // Emit stub for static call
1737 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1738 if (stub == nullptr) {
1739 fatal("CodeCache is full at gen_continuation_enter");
1740 }
1741 __ call(resolve);
1742 oop_maps->add_gc_map(__ pc() - start, map);
1743 __ post_call_nop();
1744
1745 __ jmp(L_exit);
1746 }
1747
1748 // compiled entry
1749 __ align(CodeEntryAlignment);
1750 compiled_entry_offset = __ pc() - start;
1751 __ enter();
1752
1753 stack_slots = 2; // will be adjusted in setup
1754 OopMap* map = continuation_enter_setup(masm, stack_slots);
1755
1756 // Frame is now completed as far as size and linkage.
1757 frame_complete = __ pc() - start;
1758
1759 __ verify_oop(reg_cont_obj);
1760
1761 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1762
1763 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1764 __ testptr(reg_is_cont, reg_is_cont);
1765 __ jccb(Assembler::notZero, L_thaw);
1766
1767 // --- call Continuation.enter(Continuation c, boolean isContinue)
1768
1769 // Make sure the call is patchable
1770 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1771
1772 // Emit stub for static call
1773 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1774 if (stub == nullptr) {
1775 fatal("CodeCache is full at gen_continuation_enter");
1776 }
1777
1778 // The call needs to be resolved. There's a special case for this in
1779 // SharedRuntime::find_callee_info_helper() which calls
1780 // LinkResolver::resolve_continuation_enter() which resolves the call to
1781 // Continuation.enter(Continuation c, boolean isContinue).
1782 __ call(resolve);
1783
1784 oop_maps->add_gc_map(__ pc() - start, map);
1785 __ post_call_nop();
1786
1787 __ jmpb(L_exit);
1788
1789 // --- Thawing path
1790
1791 __ bind(L_thaw);
1792
1793 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1794 __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1795
1796 ContinuationEntry::_return_pc_offset = __ pc() - start;
1797 oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1798 __ post_call_nop();
1799
1800 // --- Normal exit (resolve/thawing)
1801
1802 __ bind(L_exit);
1803 ContinuationEntry::_cleanup_offset = __ pc() - start;
1804 continuation_enter_cleanup(masm);
1805 __ pop(rbp);
1806 __ ret(0);
1807
1808 // --- Exception handling path
1809
1810 exception_offset = __ pc() - start;
1811
1812 continuation_enter_cleanup(masm);
1813 __ pop(rbp);
1814
1815 __ movptr(c_rarg0, r15_thread);
1816 __ movptr(c_rarg1, Address(rsp, 0)); // return address
1817
1818 // rax still holds the original exception oop, save it before the call
1819 __ push(rax);
1820
1821 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1822 __ movptr(rbx, rax);
1823
1824 // Continue at exception handler:
1825 // rax: exception oop
1826 // rbx: exception handler
1827 // rdx: exception pc
1828 __ pop(rax);
1829 __ verify_oop(rax);
1830 __ pop(rdx);
1831 __ jmp(rbx);
1832 }
1833
1834 static void gen_continuation_yield(MacroAssembler* masm,
1835 const VMRegPair* regs,
1836 OopMapSet* oop_maps,
1837 int& frame_complete,
1838 int& stack_slots,
1839 int& compiled_entry_offset) {
1840 enum layout {
1841 rbp_off,
1842 rbpH_off,
1843 return_off,
1844 return_off2,
1845 framesize // inclusive of return address
1846 };
1847 stack_slots = framesize / VMRegImpl::slots_per_word;
1848 assert(stack_slots == 2, "recheck layout");
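  // The layout above is four 32-bit slots (saved rbp plus the return address), i.e. two
  // 64-bit words, which is what stack_slots counts here.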
1849
1850 address start = __ pc();
1851 compiled_entry_offset = __ pc() - start;
1852 __ enter();
1853 address the_pc = __ pc();
1854
1855 frame_complete = the_pc - start;
1856
  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1860 __ post_call_nop();
1861 OopMap* map = new OopMap(framesize, 1);
1862 oop_maps->add_gc_map(frame_complete, map);
1863
1864 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1865 __ movptr(c_rarg0, r15_thread);
1866 __ movptr(c_rarg1, rsp);
1867 __ call_VM_leaf(Continuation::freeze_entry(), 2);
1868 __ reset_last_Java_frame(true);
1869
1870 Label L_pinned;
1871
1872 __ testptr(rax, rax);
1873 __ jcc(Assembler::notZero, L_pinned);
1874
1875 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1876 continuation_enter_cleanup(masm);
1877 __ pop(rbp);
1878 __ ret(0);
1879
1880 __ bind(L_pinned);
1881
1882 // Pinned, return to caller
1883
1884 // handle pending exception thrown by freeze
1885 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1886 Label ok;
1887 __ jcc(Assembler::equal, ok);
1888 __ leave();
1889 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1890 __ bind(ok);
1891
1892 __ leave();
1893 __ ret(0);
1894 }
1895
1896 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1897 ::continuation_enter_cleanup(masm);
1898 }
1899
1900 static void gen_special_dispatch(MacroAssembler* masm,
1901 const methodHandle& method,
1902 const BasicType* sig_bt,
1903 const VMRegPair* regs) {
1904 verify_oop_args(masm, method, sig_bt, regs);
1905 vmIntrinsics::ID iid = method->intrinsic_id();
1906
1907 // Now write the args into the outgoing interpreter space
1908 bool has_receiver = false;
1909 Register receiver_reg = noreg;
1910 int member_arg_pos = -1;
1911 Register member_reg = noreg;
1912 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1913 if (ref_kind != 0) {
1914 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1915 member_reg = rbx; // known to be free at this point
1916 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1917 } else if (iid == vmIntrinsics::_invokeBasic) {
1918 has_receiver = true;
1919 } else if (iid == vmIntrinsics::_linkToNative) {
1920 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
1921 member_reg = rbx; // known to be free at this point
1922 } else {
1923 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1924 }
1925
1926 if (member_reg != noreg) {
1927 // Load the member_arg into register, if necessary.
1928 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1929 VMReg r = regs[member_arg_pos].first();
1930 if (r->is_stack()) {
1931 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1932 } else {
1933 // no data motion is needed
1934 member_reg = r->as_Register();
1935 }
1936 }
1937
1938 if (has_receiver) {
1939 // Make sure the receiver is loaded into a register.
1940 assert(method->size_of_parameters() > 0, "oob");
1941 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1942 VMReg r = regs[0].first();
1943 assert(r->is_valid(), "bad receiver arg");
1944 if (r->is_stack()) {
1945 // Porting note: This assumes that compiled calling conventions always
1946 // pass the receiver oop in a register. If this is not true on some
1947 // platform, pick a temp and load the receiver from stack.
1948 fatal("receiver always in a register");
1949 receiver_reg = j_rarg0; // known to be free at this point
1950 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1951 } else {
1952 // no data motion is needed
1953 receiver_reg = r->as_Register();
1954 }
1955 }
1956
1957 // Figure out which address we are really jumping to:
1958 MethodHandles::generate_method_handle_dispatch(masm, iid,
1959 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1960 }
1961
1962 // ---------------------------------------------------------------------------
1963 // Generate a native wrapper for a given method. The method takes arguments
1964 // in the Java compiled code convention, marshals them to the native
1965 // convention (handlizes oops, etc), transitions to native, makes the call,
1966 // returns to java state (possibly blocking), unhandlizes any result and
1967 // returns.
1968 //
1969 // Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
1971 // functions. The wrapper is expected to unpack the arguments before
1972 // passing them to the callee. Critical native functions leave the state _in_Java,
1973 // since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
1976 // to be thrown.
1977 //
1978 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1979 const methodHandle& method,
1980 int compile_id,
1981 BasicType* in_sig_bt,
1982 VMRegPair* in_regs,
1983 BasicType ret_type) {
1984 if (method->is_continuation_native_intrinsic()) {
1985 int exception_offset = -1;
1986 OopMapSet* oop_maps = new OopMapSet();
1987 int frame_complete = -1;
1988 int stack_slots = -1;
1989 int interpreted_entry_offset = -1;
1990 int vep_offset = -1;
1991 if (method->is_continuation_enter_intrinsic()) {
1992 gen_continuation_enter(masm,
1993 in_regs,
1994 exception_offset,
1995 oop_maps,
1996 frame_complete,
1997 stack_slots,
1998 interpreted_entry_offset,
1999 vep_offset);
2000 } else if (method->is_continuation_yield_intrinsic()) {
2001 gen_continuation_yield(masm,
2002 in_regs,
2003 oop_maps,
2004 frame_complete,
2005 stack_slots,
2006 vep_offset);
2007 } else {
2008 guarantee(false, "Unknown Continuation native intrinsic");
2009 }
2010
2011 #ifdef ASSERT
2012 if (method->is_continuation_enter_intrinsic()) {
2013 assert(interpreted_entry_offset != -1, "Must be set");
2014 assert(exception_offset != -1, "Must be set");
2015 } else {
2016 assert(interpreted_entry_offset == -1, "Must be unset");
2017 assert(exception_offset == -1, "Must be unset");
2018 }
2019 assert(frame_complete != -1, "Must be set");
2020 assert(stack_slots != -1, "Must be set");
2021 assert(vep_offset != -1, "Must be set");
2022 #endif
2023
2024 __ flush();
2025 nmethod* nm = nmethod::new_native_nmethod(method,
2026 compile_id,
2027 masm->code(),
2028 vep_offset,
2029 frame_complete,
2030 stack_slots,
2031 in_ByteSize(-1),
2032 in_ByteSize(-1),
2033 oop_maps,
2034 exception_offset);
2035 if (nm == nullptr) return nm;
2036 if (method->is_continuation_enter_intrinsic()) {
2037 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2038 } else if (method->is_continuation_yield_intrinsic()) {
2039 _cont_doYield_stub = nm;
2040 }
2041 return nm;
2042 }
2043
2044 if (method->is_method_handle_intrinsic()) {
2045 vmIntrinsics::ID iid = method->intrinsic_id();
2046 intptr_t start = (intptr_t)__ pc();
2047 int vep_offset = ((intptr_t)__ pc()) - start;
2048 gen_special_dispatch(masm,
2049 method,
2050 in_sig_bt,
2051 in_regs);
2052 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
2053 __ flush();
2054 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
2055 return nmethod::new_native_nmethod(method,
2056 compile_id,
2057 masm->code(),
2058 vep_offset,
2059 frame_complete,
2060 stack_slots / VMRegImpl::slots_per_word,
2061 in_ByteSize(-1),
2062 in_ByteSize(-1),
2063 nullptr);
2064 }
2065 address native_func = method->native_function();
2066 assert(native_func != nullptr, "must have function");
2067
2068 // An OopMap for lock (and class if static)
2069 OopMapSet *oop_maps = new OopMapSet();
2070 intptr_t start = (intptr_t)__ pc();
2071
  // We have received a description of where all the Java args are located
2073 // on entry to the wrapper. We need to convert these args to where
2074 // the jni function will expect them. To figure out where they go
2075 // we convert the java signature to a C signature by inserting
2076 // the hidden arguments as arg[0] and possibly arg[1] (static method)
2077
2078 const int total_in_args = method->size_of_parameters();
2079 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2080
2081 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2082 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2083
2084 int argc = 0;
2085 out_sig_bt[argc++] = T_ADDRESS;
2086 if (method->is_static()) {
2087 out_sig_bt[argc++] = T_OBJECT;
2088 }
2089
2090 for (int i = 0; i < total_in_args ; i++ ) {
2091 out_sig_bt[argc++] = in_sig_bt[i];
2092 }
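  // For illustration: a static native with Java signature (J)V arrives with
  // in_sig_bt = { T_LONG, T_VOID } and ends up with
  // out_sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_LONG, T_VOID }.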
2093
2094 // Now figure out where the args must be stored and how much stack space
2095 // they require.
2096 int out_arg_slots;
2097 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2098
2099 // Compute framesize for the wrapper. We need to handlize all oops in
2100 // incoming registers
2101
2102 // Calculate the total number of stack slots we will need.
2103
2104 // First count the abi requirement plus all of the outgoing args
2105 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2106
2107 // Now the space for the inbound oop handle area
2108 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
2109
2110 int oop_handle_offset = stack_slots;
2111 stack_slots += total_save_slots;
2112
2113 // Now any space we need for handlizing a klass if static method
2114
2115 int klass_slot_offset = 0;
2116 int klass_offset = -1;
2117 int lock_slot_offset = 0;
2118 bool is_static = false;
2119
2120 if (method->is_static()) {
2121 klass_slot_offset = stack_slots;
2122 stack_slots += VMRegImpl::slots_per_word;
2123 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2124 is_static = true;
2125 }
2126
2127 // Plus a lock if needed
2128
2129 if (method->is_synchronized()) {
2130 lock_slot_offset = stack_slots;
2131 stack_slots += VMRegImpl::slots_per_word;
2132 }
2133
2134 // Now a place (+2) to save return values or temp during shuffling
2135 // + 4 for return address (which we own) and saved rbp
2136 stack_slots += 6;
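  // (Stack slots are 32-bit, so these 6 slots are one 64-bit word for return values or
  // shuffle temps plus the 64-bit return address and the 64-bit saved rbp.)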
2137
2138 // Ok The space we have allocated will look like:
2139 //
2140 //
2141 // FP-> | |
2142 // |---------------------|
2143 // | 2 slots for moves |
2144 // |---------------------|
2145 // | lock box (if sync) |
2146 // |---------------------| <- lock_slot_offset
2147 // | klass (if static) |
2148 // |---------------------| <- klass_slot_offset
2149 // | oopHandle area |
2150 // |---------------------| <- oop_handle_offset (6 java arg registers)
2151 // | outbound memory |
2152 // | based arguments |
2153 // | |
2154 // |---------------------|
2155 // | |
2156 // SP-> | out_preserved_slots |
2157 //
2158 //
2159
2160
2161 // Now compute actual number of stack words we need rounding to make
2162 // stack properly aligned.
2163 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2164
2165 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
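  // With 4-byte slots and 16-byte stack alignment, StackAlignmentInSlots is 4, so the
  // rounding above keeps rsp 16-byte aligned once the frame is allocated below.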
2166
2167 // First thing make an ic check to see if we should even be here
2168
2169 // We are free to use all registers as temps without saving them and
2170 // restoring them except rbp. rbp is the only callee save register
2171 // as far as the interpreter and the compiler(s) are concerned.
2172
2173 const Register receiver = j_rarg0;
2174
2175 Label exception_pending;
2176
2177 assert_different_registers(receiver, rscratch1, rscratch2);
2178 __ verify_oop(receiver);
2179 __ ic_check(8 /* end_alignment */);
2180
2181 int vep_offset = ((intptr_t)__ pc()) - start;
2182
2183 if (method->needs_clinit_barrier()) {
2184 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2185 Label L_skip_barrier;
2186 Register klass = r10;
2187 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2188 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2189
2190 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2191
2192 __ bind(L_skip_barrier);
2193 }
2194
2195 #ifdef COMPILER1
  // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
2197 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2198 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2199 }
2200 #endif // COMPILER1
2201
2202 // The instruction at the verified entry point must be 5 bytes or longer
2203 // because it can be patched on the fly by make_non_entrant. The stack bang
2204 // instruction fits that requirement.
2205
2206 // Generate stack overflow check
2207 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2208
2209 // Generate a new frame for the wrapper.
2210 __ enter();
2211 // -2 because return address is already present and so is saved rbp
2212 __ subptr(rsp, stack_size - 2*wordSize);
2213
2214 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2215 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2216 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2217
2218 // Frame is now completed as far as size and linkage.
2219 int frame_complete = ((intptr_t)__ pc()) - start;
2220
2221 #ifdef ASSERT
2222 __ check_stack_alignment(rsp, "improperly aligned stack");
2223 #endif /* ASSERT */
2224
2225
2226 // We use r14 as the oop handle for the receiver/klass
2227 // It is callee save so it survives the call to native
2228
2229 const Register oop_handle_reg = r14;
2230
2231 //
  // We immediately shuffle the arguments so that for any vm call we have to
2233 // make from here on out (sync slow path, jvmti, etc.) we will have
2234 // captured the oops from our caller and have a valid oopMap for
2235 // them.
2236
2237 // -----------------
2238 // The Grand Shuffle
2239
  // The Java calling convention is either equal to (Linux) or denser than (Win64) the
  // C calling convention. However, because of the jni_env argument, the C calling
  // convention always has at least one more argument (and two for static methods) than Java.
2243 // Therefore if we move the args from java -> c backwards then we will never have
2244 // a register->register conflict and we don't have to build a dependency graph
2245 // and figure out how to break any cycles.
2246 //
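  // (On Linux, for instance, j_rarg0..j_rarg4 alias c_rarg1..c_rarg5, so for a non-static
  // native many of the primitive register moves below are no-ops; the backwards order
  // matters most for stack arguments, Win64, and the extra slot taken by a static mirror.)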
2247
2248 // Record esp-based slot for receiver on stack for non-static methods
2249 int receiver_offset = -1;
2250
2251 // This is a trick. We double the stack slots so we can claim
2252 // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
2254 // sure we can capture all the incoming oop args from the
2255 // caller.
2256 //
2257 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2258
2259 // Mark location of rbp (someday)
2260 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2261
2262 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2263 // All inbound args are referenced based on rbp and all outbound args via rsp.
2264
2265
2266 #ifdef ASSERT
2267 bool reg_destroyed[Register::number_of_registers];
2268 bool freg_destroyed[XMMRegister::number_of_registers];
2269 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2270 reg_destroyed[r] = false;
2271 }
2272 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2273 freg_destroyed[f] = false;
2274 }
2275
2276 #endif /* ASSERT */
2277
2278 // For JNI natives the incoming and outgoing registers are offset upwards.
2279 GrowableArray<int> arg_order(2 * total_in_args);
2280
2281 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2282 arg_order.push(i);
2283 arg_order.push(c_arg);
2284 }
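  // arg_order now holds (java_index, c_index) pairs with the last argument first, which
  // is the back-to-front order the move loop below relies on.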
2285
2286 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2287 int i = arg_order.at(ai);
2288 int c_arg = arg_order.at(ai + 1);
2289 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2290 #ifdef ASSERT
2291 if (in_regs[i].first()->is_Register()) {
2292 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2293 } else if (in_regs[i].first()->is_XMMRegister()) {
2294 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2295 }
2296 if (out_regs[c_arg].first()->is_Register()) {
2297 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2298 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2299 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2300 }
2301 #endif /* ASSERT */
2302 switch (in_sig_bt[i]) {
2303 case T_ARRAY:
2304 case T_OBJECT:
2305 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2306 ((i == 0) && (!is_static)),
2307 &receiver_offset);
2308 break;
2309 case T_VOID:
2310 break;
2311
2312 case T_FLOAT:
2313 __ float_move(in_regs[i], out_regs[c_arg]);
2314 break;
2315
2316 case T_DOUBLE:
2317 assert( i + 1 < total_in_args &&
2318 in_sig_bt[i + 1] == T_VOID &&
2319 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2320 __ double_move(in_regs[i], out_regs[c_arg]);
2321 break;
2322
2323 case T_LONG :
2324 __ long_move(in_regs[i], out_regs[c_arg]);
2325 break;
2326
2327 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2328
2329 default:
2330 __ move32_64(in_regs[i], out_regs[c_arg]);
2331 }
2332 }
2333
2334 int c_arg;
2335
2336 // Pre-load a static method's oop into r14. Used both by locking code and
2337 // the normal JNI call code.
2338 // point c_arg at the first arg that is already loaded in case we
2339 // need to spill before we call out
2340 c_arg = total_c_args - total_in_args;
2341
2342 if (method->is_static()) {
2343
2344 // load oop into a register
2345 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2346
    // Now handlize the static class mirror; it's known to be non-null.
2348 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2349 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2350
2351 // Now get the handle
2352 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2353 // store the klass handle as second argument
2354 __ movptr(c_rarg1, oop_handle_reg);
2355 // and protect the arg if we must spill
2356 c_arg--;
2357 }
2358
2359 // Change state to native (we save the return address in the thread, since it might not
2360 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2361 // points into the right code segment. It does not have to be the correct return pc.
2362 // We use the same pc/oopMap repeatedly when we call out
2363
2364 Label native_return;
2365 if (method->is_object_wait0()) {
2366 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2367 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2368 } else {
2369 intptr_t the_pc = (intptr_t) __ pc();
2370 oop_maps->add_gc_map(the_pc - start, map);
2371
2372 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2373 }
2374
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers at this point except to save/restore them around the VM calls
  // below (that is safe: after the shuffle they hold handles rather than raw oops).
2377
2378 if (DTraceMethodProbes) {
2379 // protect the args we've loaded
2380 save_args(masm, total_c_args, c_arg, out_regs);
2381 __ mov_metadata(c_rarg1, method());
2382 __ call_VM_leaf(
2383 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2384 r15_thread, c_rarg1);
2385 restore_args(masm, total_c_args, c_arg, out_regs);
2386 }
2387
2388 // RedefineClasses() tracing support for obsolete method entry
2389 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2390 // protect the args we've loaded
2391 save_args(masm, total_c_args, c_arg, out_regs);
2392 __ mov_metadata(c_rarg1, method());
2393 __ call_VM_leaf(
2394 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2395 r15_thread, c_rarg1);
2396 restore_args(masm, total_c_args, c_arg, out_regs);
2397 }
2398
2399 // Lock a synchronized method
2400
2401 // Register definitions used by locking and unlocking
2402
2403 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2404 const Register obj_reg = rbx; // Will contain the oop
2405 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2406
2407 Label slow_path_lock;
2408 Label lock_done;
2409
2410 if (method->is_synchronized()) {
2411 // Get the handle (the 2nd argument)
2412 __ mov(oop_handle_reg, c_rarg1);
2413
2414 // Get address of the box
2415
2416 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2417
2418 // Load the oop from the handle
2419 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2420
2421 __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2422
2423 // Slow path will re-enter here
2424 __ bind(lock_done);
2425 }
2426
2427 // Finally just about ready to make the JNI call
2428
2429 // get JNIEnv* which is first argument to native
2430 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2431
2432 // Now set thread in native
2433 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2434
2435 __ call(RuntimeAddress(native_func));
2436
2437 // Verify or restore cpu control state after JNI call
2438 __ restore_cpu_control_state_after_jni(rscratch1);
2439
2440 // Unpack native results.
2441 switch (ret_type) {
2442 case T_BOOLEAN: __ c2bool(rax); break;
2443 case T_CHAR : __ movzwl(rax, rax); break;
2444 case T_BYTE : __ sign_extend_byte (rax); break;
2445 case T_SHORT : __ sign_extend_short(rax); break;
2446 case T_INT : /* nothing to do */ break;
2447 case T_DOUBLE :
2448 case T_FLOAT :
2449 // Result is in xmm0 we'll save as needed
2450 break;
2451 case T_ARRAY: // Really a handle
2452 case T_OBJECT: // Really a handle
2453 break; // can't de-handlize until after safepoint check
2454 case T_VOID: break;
2455 case T_LONG: break;
2456 default : ShouldNotReachHere();
2457 }
2458
2459 // Switch thread to "native transition" state before reading the synchronization state.
2460 // This additional state is necessary because reading and testing the synchronization
2461 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2462 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2463 // VM thread changes sync state to synchronizing and suspends threads for GC.
2464 // Thread A is resumed to finish this native method, but doesn't block here since it
  // didn't see any synchronization in progress, and escapes.
2466 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2467
2468 // Force this write out before the read below
2469 if (!UseSystemMemoryBarrier) {
2470 __ membar(Assembler::Membar_mask_bits(
2471 Assembler::LoadLoad | Assembler::LoadStore |
2472 Assembler::StoreLoad | Assembler::StoreStore));
2473 }
2474
2475 // check for safepoint operation in progress and/or pending suspend requests
2476 {
2477 Label Continue;
2478 Label slow_path;
2479
2480 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2481
2482 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2483 __ jcc(Assembler::equal, Continue);
2484 __ bind(slow_path);
2485
    // Don't use call_VM, as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2489 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2490 // by hand.
2491 //
2492 __ vzeroupper();
2493 save_native_result(masm, ret_type, stack_slots);
2494 __ mov(c_rarg0, r15_thread);
2495 __ mov(r12, rsp); // remember sp
2496 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2497 __ andptr(rsp, -16); // align stack as required by ABI
2498 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2499 __ mov(rsp, r12); // restore sp
2500 __ reinit_heapbase();
2501 // Restore any method result value
2502 restore_native_result(masm, ret_type, stack_slots);
2503 __ bind(Continue);
2504 }
2505
2506 // change thread state
2507 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2508
2509 if (method->is_object_wait0()) {
2510 // Check preemption for Object.wait()
2511 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2512 __ cmpptr(rscratch1, NULL_WORD);
2513 __ jccb(Assembler::equal, native_return);
2514 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2515 __ jmp(rscratch1);
2516 __ bind(native_return);
2517
2518 intptr_t the_pc = (intptr_t) __ pc();
2519 oop_maps->add_gc_map(the_pc - start, map);
2520 }
2521
2522
2523 Label reguard;
2524 Label reguard_done;
2525 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2526 __ jcc(Assembler::equal, reguard);
2527 __ bind(reguard_done);
2528
2529 // native result if any is live
2530
2531 // Unlock
2532 Label slow_path_unlock;
2533 Label unlock_done;
2534 if (method->is_synchronized()) {
2535
2536 Label fast_done;
2537
2538 // Get locked oop from the handle we passed to jni
2539 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2540
2541 // Must save rax if it is live now because cmpxchg must use it
2542 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2543 save_native_result(masm, ret_type, stack_slots);
2544 }
2545
2546 __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2547
2548 // slow path re-enters here
2549 __ bind(unlock_done);
2550 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2551 restore_native_result(masm, ret_type, stack_slots);
2552 }
2553
2554 __ bind(fast_done);
2555 }
2556 if (DTraceMethodProbes) {
2557 save_native_result(masm, ret_type, stack_slots);
2558 __ mov_metadata(c_rarg1, method());
2559 __ call_VM_leaf(
2560 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2561 r15_thread, c_rarg1);
2562 restore_native_result(masm, ret_type, stack_slots);
2563 }
2564
2565 __ reset_last_Java_frame(false);
2566
2567 // Unbox oop result, e.g. JNIHandles::resolve value.
2568 if (is_reference_type(ret_type)) {
2569 __ resolve_jobject(rax /* value */,
2570 rcx /* tmp */);
2571 }
2572
2573 if (CheckJNICalls) {
2574 // clear_pending_jni_exception_check
2575 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2576 }
2577
2578 // reset handle block
2579 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2580 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2581
2582 // pop our frame
2583
2584 __ leave();
2585
2586 #if INCLUDE_JFR
2587 // We need to do a poll test after unwind in case the sampler
2588 // managed to sample the native frame after returning to Java.
2589 Label L_return;
2590 address poll_test_pc = __ pc();
2591 __ relocate(relocInfo::poll_return_type);
2592 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2593 __ jccb(Assembler::zero, L_return);
2594 __ lea(rscratch1, InternalAddress(poll_test_pc));
2595 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2596 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2597 "polling page return stub not created yet");
2598 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2599 __ jump(RuntimeAddress(stub));
2600 __ bind(L_return);
2601 #endif // INCLUDE_JFR
2602
2603 // Any exception pending?
2604 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2605 __ jcc(Assembler::notEqual, exception_pending);
2606
2607 // Return
2608
2609 __ ret(0);
2610
2611 // Unexpected paths are out of line and go here
2612
2613 // forward the exception
2614 __ bind(exception_pending);
2615
2616 // and forward the exception
2617 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2618
2619 // Slow path locking & unlocking
2620 if (method->is_synchronized()) {
2621
2622 // BEGIN Slow path lock
2623 __ bind(slow_path_lock);
2624
2625 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2626 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2627
2628 // protect the args we've loaded
2629 save_args(masm, total_c_args, c_arg, out_regs);
2630
2631 __ mov(c_rarg0, obj_reg);
2632 __ mov(c_rarg1, lock_reg);
2633 __ mov(c_rarg2, r15_thread);
2634
2635 // Not a leaf but we have last_Java_frame setup as we want.
2636 // We don't want to unmount in case of contention since that would complicate preserving
2637 // the arguments that had already been marshalled into the native convention. So we force
2638 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2639 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2640 __ push_cont_fastpath();
2641 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2642 __ pop_cont_fastpath();
2643 restore_args(masm, total_c_args, c_arg, out_regs);
2644
2645 #ifdef ASSERT
2646 { Label L;
2647 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2648 __ jcc(Assembler::equal, L);
2649 __ stop("no pending exception allowed on exit from monitorenter");
2650 __ bind(L);
2651 }
2652 #endif
2653 __ jmp(lock_done);
2654
2655 // END Slow path lock
2656
2657 // BEGIN Slow path unlock
2658 __ bind(slow_path_unlock);
2659
2660 // If we haven't already saved the native result we must save it now as xmm registers
2661 // are still exposed.
2662 __ vzeroupper();
2663 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2664 save_native_result(masm, ret_type, stack_slots);
2665 }
2666
2667 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2668
2669 __ mov(c_rarg0, obj_reg);
2670 __ mov(c_rarg2, r15_thread);
2671 __ mov(r12, rsp); // remember sp
2672 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2673 __ andptr(rsp, -16); // align stack as required by ABI
2674
2675 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2676 // NOTE that obj_reg == rbx currently
2677 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2678 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2679
2680 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2681 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2682 __ mov(rsp, r12); // restore sp
2683 __ reinit_heapbase();
2684 #ifdef ASSERT
2685 {
2686 Label L;
2687 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2688 __ jcc(Assembler::equal, L);
2689 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2690 __ bind(L);
2691 }
2692 #endif /* ASSERT */
2693
2694 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2695
2696 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2697 restore_native_result(masm, ret_type, stack_slots);
2698 }
2699 __ jmp(unlock_done);
2700
2701 // END Slow path unlock
2702
2703 } // synchronized
2704
2705 // SLOW PATH Reguard the stack if needed
2706
2707 __ bind(reguard);
2708 __ vzeroupper();
2709 save_native_result(masm, ret_type, stack_slots);
2710 __ mov(r12, rsp); // remember sp
2711 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2712 __ andptr(rsp, -16); // align stack as required by ABI
2713 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2714 __ mov(rsp, r12); // restore sp
2715 __ reinit_heapbase();
2716 restore_native_result(masm, ret_type, stack_slots);
2717 // and continue
2718 __ jmp(reguard_done);
2719
2720
2721
2722 __ flush();
2723
2724 nmethod *nm = nmethod::new_native_nmethod(method,
2725 compile_id,
2726 masm->code(),
2727 vep_offset,
2728 frame_complete,
2729 stack_slots / VMRegImpl::slots_per_word,
2730 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2731 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2732 oop_maps);
2733
2734 return nm;
2735 }
2736
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation, for use during deoptimization.
2739 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2740 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2741 }
2742
2743
2744 uint SharedRuntime::out_preserve_stack_slots() {
2745 return 0;
2746 }
2747
2748
2749 // Number of stack slots between incoming argument block and the start of
2750 // a new frame. The PROLOG must add this many slots to the stack. The
2751 // EPILOG must remove this many slots. amd64 needs two slots for
2752 // return address.
2753 uint SharedRuntime::in_preserve_stack_slots() {
2754 return 4 + 2 * VerifyStackAtCalls;
2755 }
2756
2757 VMReg SharedRuntime::thread_register() {
2758 return r15_thread->as_VMReg();
2759 }
2760
2761 //------------------------------generate_deopt_blob----------------------------
2762 void SharedRuntime::generate_deopt_blob() {
2763 // Allocate space for the code
2764 ResourceMark rm;
2765 // Setup code generation tools
2766 int pad = 0;
2767 if (UseAVX > 2) {
2768 pad += 1024;
2769 }
2770 if (UseAPX) {
2771 pad += 1024;
2772 }
2773 #if INCLUDE_JVMCI
2774 if (EnableJVMCI) {
2775 pad += 512; // Increase the buffer size when compiling for JVMCI
2776 }
2777 #endif
2778 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2779 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2780 if (blob != nullptr) {
2781 _deopt_blob = blob->as_deoptimization_blob();
2782 return;
2783 }
2784
2785 CodeBuffer buffer(name, 2560+pad, 1024);
2786 MacroAssembler* masm = new MacroAssembler(&buffer);
2787 int frame_size_in_words;
2788 OopMap* map = nullptr;
2789 OopMapSet *oop_maps = new OopMapSet();
2790
2791 // -------------
2792 // This code enters when returning to a de-optimized nmethod. A return
2793 // address has been pushed on the stack, and return values are in
2794 // registers.
2795 // If we are doing a normal deopt then we were called from the patched
2796 // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size.
2798 // We will adjust the value so it looks like we have the original return
2799 // address on the stack (like when we eagerly deoptimized).
2800 // In the case of an exception pending when deoptimizing, we enter
2801 // with a return address on the stack that points after the call we patched
2802 // into the exception handler. We have the following register state from,
2803 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2804 // rax: exception oop
2805 // rbx: exception handler
2806 // rdx: throwing pc
2807 // So in this case we simply jam rdx into the useless return address and
2808 // the stack looks just like we want.
2809 //
2810 // At this point we need to de-opt. We save the argument return
2811 // registers. We call the first C routine, fetch_unroll_info(). This
2812 // routine captures the return values and returns a structure which
2813 // describes the current frame size and the sizes of all replacement frames.
2814 // The current frame is compiled code and may contain many inlined
2815 // functions, each with their own JVM state. We pop the current frame, then
2816 // push all the new frames. Then we call the C routine unpack_frames() to
2817 // populate these frames. Finally unpack_frames() returns us the new target
2818 // address. Notice that callee-save registers are BLOWN here; they have
2819 // already been captured in the vframeArray at the time the return PC was
2820 // patched.
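  // The blob generated below therefore has several entry points: the normal deopt entry
  // at 'start', a reexecute entry, an exception entry (plus an exception-in-TLS variant),
  // and, when EnableJVMCI is set, additional uncommon-trap entries.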
2821 address start = __ pc();
2822 Label cont;
2823
2824 // Prolog for non exception case!
2825
2826 // Save everything in sight.
2827 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2828
2829 // Normal deoptimization. Save exec mode for unpack_frames.
2830 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2831 __ jmp(cont);
2832
2833 int reexecute_offset = __ pc() - start;
2834 #if INCLUDE_JVMCI && !defined(COMPILER1)
2835 if (UseJVMCICompiler) {
2836 // JVMCI does not use this kind of deoptimization
2837 __ should_not_reach_here();
2838 }
2839 #endif
2840
2841 // Reexecute case
  // The return address is the pc that describes what bci to re-execute at.
2843
2844 // No need to update map as each call to save_live_registers will produce identical oopmap
2845 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2846
2847 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2848 __ jmp(cont);
2849
2850 #if INCLUDE_JVMCI
2851 Label after_fetch_unroll_info_call;
2852 int implicit_exception_uncommon_trap_offset = 0;
2853 int uncommon_trap_offset = 0;
2854
2855 if (EnableJVMCI) {
2856 implicit_exception_uncommon_trap_offset = __ pc() - start;
2857
2858 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2859 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2860
2861 uncommon_trap_offset = __ pc() - start;
2862
2863 // Save everything in sight.
2864 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2865 // fetch_unroll_info needs to call last_java_frame()
2866 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2867
2868 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2869 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2870
2871 __ movl(r14, Deoptimization::Unpack_reexecute);
2872 __ mov(c_rarg0, r15_thread);
2873 __ movl(c_rarg2, r14); // exec mode
2874 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2875 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2876
2877 __ reset_last_Java_frame(false);
2878
2879 __ jmp(after_fetch_unroll_info_call);
2880 } // EnableJVMCI
2881 #endif // INCLUDE_JVMCI
2882
2883 int exception_offset = __ pc() - start;
2884
2885 // Prolog for exception case
2886
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
2890 // unpack_with_exception_in_tls entry point.
2891
2892 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2893 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2894
2895 int exception_in_tls_offset = __ pc() - start;
2896
2897 // new implementation because exception oop is now passed in JavaThread
2898
2899 // Prolog for exception case
2900 // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2902 // tos: stack at point of call to method that threw the exception (i.e. only
2903 // args are on the stack, no return address)
2904
2905 // make room on stack for the return address
2906 // It will be patched later with the throwing pc. The correct value is not
2907 // available now because loading it from memory would destroy registers.
2908 __ push(0);
2909
2910 // Save everything in sight.
2911 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2912
2913 // Now it is safe to overwrite any register
2914
2915 // Deopt during an exception. Save exec mode for unpack_frames.
2916 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2917
2918 // load throwing pc from JavaThread and patch it as the return address
2919 // of the current frame. Then clear the field in JavaThread
2920
2921 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2922 __ movptr(Address(rbp, wordSize), rdx);
2923 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2924
2925 #ifdef ASSERT
2926 // verify that there is really an exception oop in JavaThread
2927 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2928 __ verify_oop(rax);
2929
2930 // verify that there is no pending exception
2931 Label no_pending_exception;
2932 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2933 __ testptr(rax, rax);
2934 __ jcc(Assembler::zero, no_pending_exception);
2935 __ stop("must not have pending exception here");
2936 __ bind(no_pending_exception);
2937 #endif
2938
2939 __ bind(cont);
2940
2941 // Call C code. Need thread and this frame, but NOT official VM entry
2942 // crud. We cannot block on this call, no GC can happen.
2943 //
2944 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2945
2946 // fetch_unroll_info needs to call last_java_frame().
2947
2948 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2949 #ifdef ASSERT
2950 { Label L;
2951 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2952 __ jcc(Assembler::equal, L);
2953 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2954 __ bind(L);
2955 }
2956 #endif // ASSERT
2957 __ mov(c_rarg0, r15_thread);
2958 __ movl(c_rarg1, r14); // exec_mode
2959 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2960
2961 // Need to have an oopmap that tells fetch_unroll_info where to
2962 // find any register it might need.
2963 oop_maps->add_gc_map(__ pc() - start, map);
2964
2965 __ reset_last_Java_frame(false);
2966
2967 #if INCLUDE_JVMCI
2968 if (EnableJVMCI) {
2969 __ bind(after_fetch_unroll_info_call);
2970 }
2971 #endif
2972
2973 // Load UnrollBlock* into rdi
2974 __ mov(rdi, rax);
2975
2976 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2977 Label noException;
2978 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2979 __ jcc(Assembler::notEqual, noException);
2980 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
// QQQ this is useless; it was null above
2982 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2983 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2984 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2985
2986 __ verify_oop(rax);
2987
2988 // Overwrite the result registers with the exception results.
2989 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2990 // I think this is useless
2991 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2992
2993 __ bind(noException);
2994
2995 // Only register save data is on the stack.
2996 // Now restore the result registers. Everything else is either dead
2997 // or captured in the vframeArray.
2998 RegisterSaver::restore_result_registers(masm);
2999
// All of the register save area has been popped off the stack. Only the
// return address remains.
3002
3003 // Pop all the frames we must move/replace.
3004 //
3005 // Frame picture (youngest to oldest)
3006 // 1: self-frame (no frame link)
3007 // 2: deopting frame (no frame link)
3008 // 3: caller of deopting frame (could be compiled/interpreted).
3009 //
// Note: by leaving the return address of self-frame on the stack
// and using the size of frame 2 to adjust the stack
// when we are done, the return to frame 3 will still be on the stack.
3013
3014 // Pop deoptimized frame
3015 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3016 __ addptr(rsp, rcx);
3017
3018 // rsp should be pointing at the return address to the caller (3)
3019
// Pick up the initial fp we should save.
// Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
3022 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3023
3024 #ifdef ASSERT
// Compilers generate code that bangs the stack by as much as the
// interpreter would need, so this stack banging should never
// trigger a fault. Verify that it does not on non-product builds.
3028 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3029 __ bang_stack_size(rbx, rcx);
3030 #endif
3031
3032 // Load address of array of frame pcs into rcx
3033 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3034
3035 // Trash the old pc
3036 __ addptr(rsp, wordSize);
3037
3038 // Load address of array of frame sizes into rsi
3039 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3040
3041 // Load counter into rdx
3042 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3043
// Now adjust the caller's stack to make up for the extra locals,
// but record the original sp first so that we can save it in the skeletal
// interpreter frame; the stack walking of interpreter_sender will then get
// the unextended sp value and not the "real" sp value.
3048
3049 const Register sender_sp = r8;
3050
3051 __ mov(sender_sp, rsp);
3052 __ movl(rbx, Address(rdi,
3053 Deoptimization::UnrollBlock::
3054 caller_adjustment_offset()));
3055 __ subptr(rsp, rbx);
3056
3057 // Push interpreter frames in a loop
3058 Label loop;
3059 __ bind(loop);
3060 __ movptr(rbx, Address(rsi, 0)); // Load frame size
3061 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
3062 __ pushptr(Address(rcx, 0)); // Save return address
3063 __ enter(); // Save old & set new ebp
3064 __ subptr(rsp, rbx); // Prolog
3065 // This value is corrected by layout_activation_impl
3066 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3067 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3068 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
3069 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
3070 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
3071 __ decrementl(rdx); // Decrement counter
3072 __ jcc(Assembler::notZero, loop);
3073 __ pushptr(Address(rcx, 0)); // Save final return address
3074
3075 // Re-push self-frame
3076 __ enter(); // Save old & set new ebp
3077
3078 // Allocate a full sized register save area.
3079 // Return address and rbp are in place, so we allocate two less words.
3080 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3081
3082 // Restore frame locals after moving the frame
3083 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3084 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3085
3086 // Call C code. Need thread but NOT official VM entry
3087 // crud. We cannot block on this call, no GC can happen. Call should
3088 // restore return values to their stack-slots with the new SP.
3089 //
3090 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3091
3092 // Use rbp because the frames look interpreted now
3093 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3094 // Don't need the precise return PC here, just precise enough to point into this code blob.
3095 address the_pc = __ pc();
3096 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3097
3098 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
3099 __ mov(c_rarg0, r15_thread);
3100 __ movl(c_rarg1, r14); // second arg: exec_mode
3101 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3102 // Revert SP alignment after call since we're going to do some SP relative addressing below
3103 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3104
3105 // Set an oopmap for the call site
3106 // Use the same PC we used for the last java frame
3107 oop_maps->add_gc_map(the_pc - start,
3108 new OopMap( frame_size_in_words, 0 ));
3109
3110 // Clear fp AND pc
3111 __ reset_last_Java_frame(true);
3112
3113 // Collect return values
3114 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3115 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3116 // I think this is useless (throwing pc?)
3117 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3118
3119 // Pop self-frame.
3120 __ leave(); // Epilog
3121
3122 // Jump to interpreter
3123 __ ret(0);
3124
3125 // Make sure all code is generated
3126 masm->flush();
3127
3128 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3129 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3130 #if INCLUDE_JVMCI
3131 if (EnableJVMCI) {
3132 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3133 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3134 }
3135 #endif
3136
3137 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3138 }
3139
3140 //------------------------------generate_handler_blob------
3141 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
3144 //
3145 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
3146 assert(StubRoutines::forward_exception_entry() != nullptr,
3147 "must be generated before");
3148 assert(is_polling_page_id(id), "expected a polling page stub id");
3149
3150 // Allocate space for the code. Setup code generation tools.
3151 const char* name = SharedRuntime::stub_name(id);
3152 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3153 if (blob != nullptr) {
3154 return blob->as_safepoint_blob();
3155 }
3156
3157 ResourceMark rm;
3158 OopMapSet *oop_maps = new OopMapSet();
3159 OopMap* map;
3160 CodeBuffer buffer(name, 2548, 1024);
3161 MacroAssembler* masm = new MacroAssembler(&buffer);
3162
3163 address start = __ pc();
3164 address call_pc = nullptr;
3165 int frame_size_in_words;
3166 bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
3167 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
3168
3169 // Make room for return address (or push it again)
3170 if (!cause_return) {
3171 __ push(rbx);
3172 }
3173
3174 // Save registers, fpu state, and flags
3175 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3176
3177 // The following is basically a call_VM. However, we need the precise
3178 // address of the call in order to generate an oopmap. Hence, we do all the
3179 // work ourselves.
3180
3181 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3182
// The return address must always be correct so that the frame constructor
// never sees an invalid pc.
3185
3186 if (!cause_return) {
3187 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3188 // Additionally, rbx is a callee saved register and we can look at it later to determine
3189 // if someone changed the return address for us!
3190 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3191 __ movptr(Address(rbp, wordSize), rbx);
3192 }
3193
3194 // Do the call
3195 __ mov(c_rarg0, r15_thread);
3196 __ call(RuntimeAddress(call_ptr));
3197
3198 // Set an oopmap for the call site. This oopmap will map all
3199 // oop-registers and debug-info registers as callee-saved. This
3200 // will allow deoptimization at this safepoint to find all possible
3201 // debug-info recordings, as well as let GC find all oops.
3202
3203 oop_maps->add_gc_map( __ pc() - start, map);
3204
3205 Label noException;
3206
3207 __ reset_last_Java_frame(false);
3208
3209 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3210 __ jcc(Assembler::equal, noException);
3211
3212 // Exception pending
3213
3214 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3215
3216 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3217
3218 // No exception case
3219 __ bind(noException);
3220
3221 Label no_adjust;
3222 #ifdef ASSERT
3223 Label bail;
3224 #endif
3225 if (!cause_return) {
3226 Label no_prefix, not_special, check_rex_prefix;
3227
3228 // If our stashed return pc was modified by the runtime we avoid touching it
3229 __ cmpptr(rbx, Address(rbp, wordSize));
3230 __ jcc(Assembler::notEqual, no_adjust);
3231
3232 // Skip over the poll instruction.
3233 // See NativeInstruction::is_safepoint_poll()
3234 // Possible encodings:
3235 // 85 00 test %eax,(%rax)
3236 // 85 01 test %eax,(%rcx)
3237 // 85 02 test %eax,(%rdx)
3238 // 85 03 test %eax,(%rbx)
3239 // 85 06 test %eax,(%rsi)
3240 // 85 07 test %eax,(%rdi)
3241 //
3242 // 41 85 00 test %eax,(%r8)
3243 // 41 85 01 test %eax,(%r9)
3244 // 41 85 02 test %eax,(%r10)
3245 // 41 85 03 test %eax,(%r11)
3246 // 41 85 06 test %eax,(%r14)
3247 // 41 85 07 test %eax,(%r15)
3248 //
3249 // 85 04 24 test %eax,(%rsp)
3250 // 41 85 04 24 test %eax,(%r12)
3251 // 85 45 00 test %eax,0x0(%rbp)
3252 // 41 85 45 00 test %eax,0x0(%r13)
3253 //
// Notes:
// Format of the legacy MAP0 test instruction:
//   [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
// o For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
//   first register operand and of the base register of the memory operand lie in [0, 8),
//   so no additional REX prefix (whose REX.B bit would hold the MSB of the register
//   encoding) is required and a two-byte encoding is sufficient.
// o For a safepoint polling instruction such as "test %eax,(%r8)", the base register of
//   the memory operand encodes to 0b1000, so an additional REX prefix is required,
//   adding one byte to the instruction encoding.
// o If the base register is one of the 32 extended GPRs available only on targets
//   supporting the Intel APX extension, a two-byte REX2 prefix is needed to hold the
//   two most significant bits of the 5-bit register encoding.
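// Illustration (a sketch derived from the encodings listed above, not an exhaustive
// decoder): for "41 85 06" (test %eax,(%r14)) the code below first steps over the one
// REX prefix byte and then the two opcode/ModRM bytes; for "85 45 00" (test %eax,0x0(%rbp))
// there is no prefix, but the rbp base forces a one-byte displacement, which the
// r12/r13/rsp/rbp special case below accounts for before the final two-byte adjustment.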
3267
3268 if (VM_Version::supports_apx_f()) {
3269 __ cmpb(Address(rbx, 0), Assembler::REX2);
3270 __ jccb(Assembler::notEqual, check_rex_prefix);
3271 __ addptr(rbx, 2);
3272 __ bind(check_rex_prefix);
3273 }
3274 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3275 __ jccb(Assembler::notEqual, no_prefix);
3276 __ addptr(rbx, 1);
3277 __ bind(no_prefix);
3278 #ifdef ASSERT
3279 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3280 #endif
3281 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3282 // r12/rsp 0x04
3283 // r13/rbp 0x05
3284 __ movzbq(rcx, Address(rbx, 1));
3285 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3286 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3287 __ cmpptr(rcx, 1);
3288 __ jccb(Assembler::above, not_special);
3289 __ addptr(rbx, 1);
3290 __ bind(not_special);
3291 #ifdef ASSERT
3292 // Verify the correct encoding of the poll we're about to skip.
3293 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3294 __ jcc(Assembler::notEqual, bail);
3295 // Mask out the modrm bits
3296 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3297 // rax encodes to 0, so if the bits are nonzero it's incorrect
3298 __ jcc(Assembler::notZero, bail);
3299 #endif
3300 // Adjust return pc forward to step over the safepoint poll instruction
3301 __ addptr(rbx, 2);
3302 __ movptr(Address(rbp, wordSize), rbx);
3303 }
3304
3305 __ bind(no_adjust);
3306 // Normal exit, restore registers and exit.
3307 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3308 __ ret(0);
3309
3310 #ifdef ASSERT
3311 __ bind(bail);
3312 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3313 #endif
3314
3315 // Make sure all code is generated
3316 masm->flush();
3317
3318 // Fill-out other meta info
3319 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3320
3321 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3322 return sp_blob;
3323 }
3324
3325 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3327 //
// Generate a stub that calls into the VM to find the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are, and the
// caller must do any GC of the args.
3332 //
3333 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3334 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3335 assert(is_resolve_id(id), "expected a resolve stub id");
3336
3337 const char* name = SharedRuntime::stub_name(id);
3338 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3339 if (blob != nullptr) {
3340 return blob->as_runtime_stub();
3341 }
3342
3343 // allocate space for the code
3344 ResourceMark rm;
3345 CodeBuffer buffer(name, 1552, 512);
3346 MacroAssembler* masm = new MacroAssembler(&buffer);
3347
3348 int frame_size_in_words;
3349
3350 OopMapSet *oop_maps = new OopMapSet();
3351 OopMap* map = nullptr;
3352
3353 int start = __ offset();
3354
3355 // No need to save vector registers since they are caller-saved anyway.
3356 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3357
3358 int frame_complete = __ offset();
3359
3360 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3361
3362 __ mov(c_rarg0, r15_thread);
3363
3364 __ call(RuntimeAddress(destination));
3365
3366
3367 // Set an oopmap for the call site.
3368 // We need this not only for callee-saved registers, but also for volatile
3369 // registers that the compiler might be keeping live across a safepoint.
3370
3371 oop_maps->add_gc_map( __ offset() - start, map);
3372
3373 // rax contains the address we are going to jump to assuming no exception got installed
3374
3375 // clear last_Java_sp
3376 __ reset_last_Java_frame(false);
3377 // check for pending exceptions
3378 Label pending;
3379 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3380 __ jcc(Assembler::notEqual, pending);
3381
3382 // get the returned Method*
3383 __ get_vm_result_metadata(rbx);
3384 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3385
3386 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3387
3388 RegisterSaver::restore_live_registers(masm);
3389
3390 // We are back to the original state on entry and ready to go.
3391
3392 __ jmp(rax);
3393
3394 // Pending exception after the safepoint
3395
3396 __ bind(pending);
3397
3398 RegisterSaver::restore_live_registers(masm);
3399
3400 // exception pending => remove activation and forward to exception handler
3401
3402 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3403
3404 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3405 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3406
3407 // -------------
3408 // make sure all code is generated
3409 masm->flush();
3410
3411 // return the blob
// frame size is in words (see the note on codeBlob framesize in generate_throw_exception)
3413 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3414
3415 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3416 return rs_blob;
3417 }
3418
3419 // Continuation point for throwing of implicit exceptions that are
3420 // not handled in the current activation. Fabricates an exception
3421 // oop and initiates normal exception dispatching in this
3422 // frame. Since we need to preserve callee-saved values (currently
3423 // only for C2, but done for C1 as well) we need a callee-saved oop
3424 // map and therefore have to make these stubs into RuntimeStubs
3425 // rather than BufferBlobs. If the compiler needs all registers to
3426 // be preserved between the fault point and the exception handler
3427 // then it must assume responsibility for that in
3428 // AbstractCompiler::continuation_for_implicit_null_exception or
3429 // continuation_for_implicit_division_by_zero_exception. All other
3430 // implicit exceptions (e.g., NullPointerException or
3431 // AbstractMethodError on entry) are either at call sites or
3432 // otherwise assume that stack unwinding will be initiated, so
3433 // caller saved registers were assumed volatile in the compiler.
3434 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3435 assert(is_throw_id(id), "expected a throw stub id");
3436
3437 const char* name = SharedRuntime::stub_name(id);
3438
3439 // Information about frame layout at time of blocking runtime call.
3440 // Note that we only have to preserve callee-saved registers since
3441 // the compilers are responsible for supplying a continuation point
3442 // if they expect all registers to be preserved.
3443 enum layout {
3444 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3445 rbp_off2,
3446 return_off,
3447 return_off2,
3448 framesize // inclusive of return address
3449 };
3450
3451 int insts_size = 512;
3452 int locs_size = 64;
3453
3454 const char* timer_msg = "SharedRuntime generate_throw_exception";
3455 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3456
3457 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3458 if (blob != nullptr) {
3459 return blob->as_runtime_stub();
3460 }
3461
3462 ResourceMark rm;
3463 CodeBuffer code(name, insts_size, locs_size);
3464 OopMapSet* oop_maps = new OopMapSet();
3465 MacroAssembler* masm = new MacroAssembler(&code);
3466
3467 address start = __ pc();
3468
3469 // This is an inlined and slightly modified version of call_VM
3470 // which has the ability to fetch the return PC out of
3471 // thread-local storage and also sets up last_Java_sp slightly
3472 // differently than the real call_VM
3473
3474 __ enter(); // required for proper stackwalking of RuntimeStub frame
3475
3476 assert(is_even(framesize/2), "sp not 16-byte aligned");
3477
3478 // return address and rbp are already in place
3479 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3480
3481 int frame_complete = __ pc() - start;
3482
3483 // Set up last_Java_sp and last_Java_fp
3484 address the_pc = __ pc();
3485 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3486 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3487
3488 // Call runtime
3489 __ movptr(c_rarg0, r15_thread);
3490 BLOCK_COMMENT("call runtime_entry");
3491 __ call(RuntimeAddress(runtime_entry));
3492
3493 // Generate oop map
3494 OopMap* map = new OopMap(framesize, 0);
3495
3496 oop_maps->add_gc_map(the_pc - start, map);
3497
3498 __ reset_last_Java_frame(true);
3499
3500 __ leave(); // required for proper stackwalking of RuntimeStub frame
3501
3502 // check for pending exceptions
3503 #ifdef ASSERT
3504 Label L;
3505 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3506 __ jcc(Assembler::notEqual, L);
3507 __ should_not_reach_here();
3508 __ bind(L);
3509 #endif // ASSERT
3510 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3511
3512
3513 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3514 RuntimeStub* stub =
3515 RuntimeStub::new_runtime_stub(name,
3516 &code,
3517 frame_complete,
3518 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3519 oop_maps, false);
3520 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3521
3522 return stub;
3523 }
3524
3525 //------------------------------Montgomery multiplication------------------------
3526 //
3527
3528 #ifndef _WINDOWS
3529
3530 // Subtract 0:b from carry:a. Return carry.
3531 static julong
3532 sub(julong a[], julong b[], julong carry, long len) {
3533 long long i = 0, cnt = len;
3534 julong tmp;
3535 asm volatile("clc; "
3536 "0: ; "
3537 "mov (%[b], %[i], 8), %[tmp]; "
3538 "sbb %[tmp], (%[a], %[i], 8); "
3539 "inc %[i]; dec %[cnt]; "
3540 "jne 0b; "
3541 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3542 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3543 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3544 : "memory");
3545 return tmp;
3546 }
3547
3548 // Multiply (unsigned) Long A by Long B, accumulating the double-
3549 // length result into the accumulator formed of T0, T1, and T2.
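// Informally: (T2:T1:T0) += A * B, with T0 the least significant word of the
// triple-precision accumulator; MACC2 below adds the double-length product in twice.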
3550 #define MACC(A, B, T0, T1, T2) \
3551 do { \
3552 unsigned long hi, lo; \
3553 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3554 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3555 : "r"(A), "a"(B) : "cc"); \
3556 } while(0)
3557
3558 // As above, but add twice the double-length result into the
3559 // accumulator.
3560 #define MACC2(A, B, T0, T1, T2) \
3561 do { \
3562 unsigned long hi, lo; \
3563 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3564 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3565 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3566 : "r"(A), "a"(B) : "cc"); \
3567 } while(0)
3568
3569 #else //_WINDOWS
3570
3571 static julong
3572 sub(julong a[], julong b[], julong carry, long len) {
3573 long i;
3574 julong tmp;
3575 unsigned char c = 1;
3576 for (i = 0; i < len; i++) {
3577 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3578 a[i] = tmp;
3579 }
3580 c = _addcarry_u64(c, carry, ~0, &tmp);
3581 return tmp;
3582 }
3583
3584 // Multiply (unsigned) Long A by Long B, accumulating the double-
3585 // length result into the accumulator formed of T0, T1, and T2.
3586 #define MACC(A, B, T0, T1, T2) \
3587 do { \
3588 julong hi, lo; \
3589 lo = _umul128(A, B, &hi); \
3590 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3591 c = _addcarry_u64(c, hi, T1, &T1); \
3592 _addcarry_u64(c, T2, 0, &T2); \
3593 } while(0)
3594
3595 // As above, but add twice the double-length result into the
3596 // accumulator.
3597 #define MACC2(A, B, T0, T1, T2) \
3598 do { \
3599 julong hi, lo; \
3600 lo = _umul128(A, B, &hi); \
3601 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3602 c = _addcarry_u64(c, hi, T1, &T1); \
3603 _addcarry_u64(c, T2, 0, &T2); \
3604 c = _addcarry_u64(0, lo, T0, &T0); \
3605 c = _addcarry_u64(c, hi, T1, &T1); \
3606 _addcarry_u64(c, T2, 0, &T2); \
3607 } while(0)
3608
3609 #endif //_WINDOWS
3610
3611 // Fast Montgomery multiplication. The derivation of the algorithm is
3612 // in A Cryptographic Library for the Motorola DSP56000,
3613 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
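// A sketch of what the routine computes (using the usual Montgomery notation): with
// R = 2^(64*len) and inv == -n[0]^-1 (mod 2^64), the result satisfies
//   m == a * b * R^-1 (mod n).
// The reduction is interleaved with the multiplication: choosing m[i] = t0 * inv makes
// the low accumulator word zero after each outer iteration (see the assert below).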
3614
3615 static void NOINLINE
3616 montgomery_multiply(julong a[], julong b[], julong n[],
3617 julong m[], julong inv, int len) {
3618 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3619 int i;
3620
3621 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3622
3623 for (i = 0; i < len; i++) {
3624 int j;
3625 for (j = 0; j < i; j++) {
3626 MACC(a[j], b[i-j], t0, t1, t2);
3627 MACC(m[j], n[i-j], t0, t1, t2);
3628 }
3629 MACC(a[i], b[0], t0, t1, t2);
3630 m[i] = t0 * inv;
3631 MACC(m[i], n[0], t0, t1, t2);
3632
3633 assert(t0 == 0, "broken Montgomery multiply");
3634
3635 t0 = t1; t1 = t2; t2 = 0;
3636 }
3637
3638 for (i = len; i < 2*len; i++) {
3639 int j;
3640 for (j = i-len+1; j < len; j++) {
3641 MACC(a[j], b[i-j], t0, t1, t2);
3642 MACC(m[j], n[i-j], t0, t1, t2);
3643 }
3644 m[i-len] = t0;
3645 t0 = t1; t1 = t2; t2 = 0;
3646 }
3647
3648 while (t0)
3649 t0 = sub(m, n, t0, len);
3650 }
3651
3652 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3653 // multiplies so it should be up to 25% faster than Montgomery
3654 // multiplication. However, its loop control is more complex and it
3655 // may actually run slower on some machines.
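// Where the saving comes from (informal note): the cross terms a[j]*a[i-j] and
// a[i-j]*a[j] are equal, so each pair is accumulated once with MACC2 (which adds the
// product twice) instead of with two separate MACCs; the lone square term a[j]*a[j]
// is added once when i is even.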
3656
3657 static void NOINLINE
3658 montgomery_square(julong a[], julong n[],
3659 julong m[], julong inv, int len) {
3660 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3661 int i;
3662
3663 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3664
3665 for (i = 0; i < len; i++) {
3666 int j;
3667 int end = (i+1)/2;
3668 for (j = 0; j < end; j++) {
3669 MACC2(a[j], a[i-j], t0, t1, t2);
3670 MACC(m[j], n[i-j], t0, t1, t2);
3671 }
3672 if ((i & 1) == 0) {
3673 MACC(a[j], a[j], t0, t1, t2);
3674 }
3675 for (; j < i; j++) {
3676 MACC(m[j], n[i-j], t0, t1, t2);
3677 }
3678 m[i] = t0 * inv;
3679 MACC(m[i], n[0], t0, t1, t2);
3680
3681 assert(t0 == 0, "broken Montgomery square");
3682
3683 t0 = t1; t1 = t2; t2 = 0;
3684 }
3685
3686 for (i = len; i < 2*len; i++) {
3687 int start = i-len+1;
3688 int end = start + (len - start)/2;
3689 int j;
3690 for (j = start; j < end; j++) {
3691 MACC2(a[j], a[i-j], t0, t1, t2);
3692 MACC(m[j], n[i-j], t0, t1, t2);
3693 }
3694 if ((i & 1) == 0) {
3695 MACC(a[j], a[j], t0, t1, t2);
3696 }
3697 for (; j < len; j++) {
3698 MACC(m[j], n[i-j], t0, t1, t2);
3699 }
3700 m[i-len] = t0;
3701 t0 = t1; t1 = t2; t2 = 0;
3702 }
3703
3704 while (t0)
3705 t0 = sub(m, n, t0, len);
3706 }
3707
3708 // Swap words in a longword.
3709 static julong swap(julong x) {
3710 return (x << 32) | (x >> 32);
3711 }
3712
3713 // Copy len longwords from s to d, word-swapping as we go. The
3714 // destination array is reversed.
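// For example (assuming len == 2): s = { 0x0000000100000002, 0x0000000300000004 }
// produces d = { 0x0000000400000003, 0x0000000200000001 }.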
3715 static void reverse_words(julong *s, julong *d, int len) {
3716 d += len;
3717 while(len-- > 0) {
3718 d--;
3719 *d = swap(*s);
3720 s++;
3721 }
3722 }
3723
3724 // The threshold at which squaring is advantageous was determined
3725 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3726 #define MONTGOMERY_SQUARING_THRESHOLD 64
3727
3728 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3729 jint len, jlong inv,
3730 jint *m_ints) {
3731 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3732 int longwords = len/2;
3733
3734 // Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
3736 // will use here a total of 8k bytes of stack space.
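// (Arithmetic: 512 jints == 256 julongs, and 4 scratch arrays * 256 julongs * 8 bytes
// == 8192 bytes, which is what the guarantee below enforces.)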
3737 int divisor = sizeof(julong) * 4;
3738 guarantee(longwords <= 8192 / divisor, "must be");
3739 int total_allocation = longwords * sizeof (julong) * 4;
3740 julong *scratch = (julong *)alloca(total_allocation);
3741
3742 // Local scratch arrays
3743 julong
3744 *a = scratch + 0 * longwords,
3745 *b = scratch + 1 * longwords,
3746 *n = scratch + 2 * longwords,
3747 *m = scratch + 3 * longwords;
3748
3749 reverse_words((julong *)a_ints, a, longwords);
3750 reverse_words((julong *)b_ints, b, longwords);
3751 reverse_words((julong *)n_ints, n, longwords);
3752
3753 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3754
3755 reverse_words(m, (julong *)m_ints, longwords);
3756 }
3757
3758 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3759 jint len, jlong inv,
3760 jint *m_ints) {
3761 assert(len % 2 == 0, "array length in montgomery_square must be even");
3762 int longwords = len/2;
3763
3764 // Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
3766 // will use here a total of 6k bytes of stack space.
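// (Arithmetic: 3 scratch arrays * 256 julongs * 8 bytes == 6144 bytes for a 512-jint input.)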
3767 int divisor = sizeof(julong) * 3;
3768 guarantee(longwords <= (8192 / divisor), "must be");
3769 int total_allocation = longwords * sizeof (julong) * 3;
3770 julong *scratch = (julong *)alloca(total_allocation);
3771
3772 // Local scratch arrays
3773 julong
3774 *a = scratch + 0 * longwords,
3775 *n = scratch + 1 * longwords,
3776 *m = scratch + 2 * longwords;
3777
3778 reverse_words((julong *)a_ints, a, longwords);
3779 reverse_words((julong *)n_ints, n, longwords);
3780
3781 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3782 ::montgomery_square(a, n, m, (julong)inv, longwords);
3783 } else {
3784 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3785 }
3786
3787 reverse_words(m, (julong *)m_ints, longwords);
3788 }
3789
3790 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3791 BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3792 if (buf == nullptr) {
3793 return nullptr;
3794 }
3795 CodeBuffer buffer(buf);
3796 short buffer_locs[20];
3797 buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3798 sizeof(buffer_locs)/sizeof(relocInfo));
3799
3800 MacroAssembler* masm = new MacroAssembler(&buffer);
3801
3802 const Array<SigEntry>* sig_vk = vk->extended_sig();
3803 const Array<VMRegPair>* regs = vk->return_regs();
3804
3805 int pack_fields_jobject_off = __ offset();
3806 // Resolve pre-allocated buffer from JNI handle.
3807 // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3808 __ movptr(rax, Address(r13, 0));
3809 __ resolve_jobject(rax /* value */,
3810 r12 /* tmp */);
3811 __ movptr(Address(r13, 0), rax);
3812
3813 int pack_fields_off = __ offset();
3814
3815 int j = 1;
3816 for (int i = 0; i < sig_vk->length(); i++) {
3817 BasicType bt = sig_vk->at(i)._bt;
3818 if (bt == T_METADATA) {
3819 continue;
3820 }
3821 if (bt == T_VOID) {
3822 if (sig_vk->at(i-1)._bt == T_LONG ||
3823 sig_vk->at(i-1)._bt == T_DOUBLE) {
3824 j++;
3825 }
3826 continue;
3827 }
3828 int off = sig_vk->at(i)._offset;
3829 assert(off > 0, "offset in object should be positive");
3830 VMRegPair pair = regs->at(j);
3831 VMReg r_1 = pair.first();
3832 Address to(rax, off);
3833 if (bt == T_FLOAT) {
3834 __ movflt(to, r_1->as_XMMRegister());
3835 } else if (bt == T_DOUBLE) {
3836 __ movdbl(to, r_1->as_XMMRegister());
3837 } else {
3838 Register val = r_1->as_Register();
3839 assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3840 if (is_reference_type(bt)) {
3841 // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
3842 __ mov(rbx, rax);
3843 Address to_with_rbx(rbx, off);
3844 __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3845 } else {
3846 __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3847 }
3848 }
3849 j++;
3850 }
3851 assert(j == regs->length(), "missed a field?");
3852 if (vk->supports_nullable_layouts()) {
3853 // Set the null marker
3854 __ movb(Address(rax, vk->null_marker_offset()), 1);
3855 }
3856 __ ret(0);
3857
3858 int unpack_fields_off = __ offset();
3859
3860 Label skip;
3861 Label not_null;
3862 __ testptr(rax, rax);
3863 __ jcc(Assembler::notZero, not_null);
3864
3865 // Return value is null. Zero all registers because the runtime requires a canonical
3866 // representation of a flat null.
3867 j = 1;
3868 for (int i = 0; i < sig_vk->length(); i++) {
3869 BasicType bt = sig_vk->at(i)._bt;
3870 if (bt == T_METADATA) {
3871 continue;
3872 }
3873 if (bt == T_VOID) {
3874 if (sig_vk->at(i-1)._bt == T_LONG ||
3875 sig_vk->at(i-1)._bt == T_DOUBLE) {
3876 j++;
3877 }
3878 continue;
3879 }
3880
3881 VMRegPair pair = regs->at(j);
3882 VMReg r_1 = pair.first();
3883 if (r_1->is_XMMRegister()) {
3884 __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
3885 } else {
3886 __ xorl(r_1->as_Register(), r_1->as_Register());
3887 }
3888 j++;
3889 }
3890 __ jmp(skip);
3891 __ bind(not_null);
3892
3893 j = 1;
3894 for (int i = 0; i < sig_vk->length(); i++) {
3895 BasicType bt = sig_vk->at(i)._bt;
3896 if (bt == T_METADATA) {
3897 continue;
3898 }
3899 if (bt == T_VOID) {
3900 if (sig_vk->at(i-1)._bt == T_LONG ||
3901 sig_vk->at(i-1)._bt == T_DOUBLE) {
3902 j++;
3903 }
3904 continue;
3905 }
3906 int off = sig_vk->at(i)._offset;
3907 assert(off > 0, "offset in object should be positive");
3908 VMRegPair pair = regs->at(j);
3909 VMReg r_1 = pair.first();
3910 VMReg r_2 = pair.second();
3911 Address from(rax, off);
3912 if (bt == T_FLOAT) {
3913 __ movflt(r_1->as_XMMRegister(), from);
3914 } else if (bt == T_DOUBLE) {
3915 __ movdbl(r_1->as_XMMRegister(), from);
3916 } else if (bt == T_OBJECT || bt == T_ARRAY) {
3917 assert_different_registers(rax, r_1->as_Register());
3918 __ load_heap_oop(r_1->as_Register(), from);
3919 } else {
3920 assert(is_java_primitive(bt), "unexpected basic type");
3921 assert_different_registers(rax, r_1->as_Register());
3922 size_t size_in_bytes = type2aelembytes(bt);
3923 __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3924 }
3925 j++;
3926 }
3927 assert(j == regs->length(), "missed a field?");
3928
3929 __ bind(skip);
3930 __ ret(0);
3931
3932 __ flush();
3933
3934 return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3935 }
3936
3937 #if INCLUDE_JFR
3938
3939 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3940 // It returns a jobject handle to the event writer.
3941 // The handle is dereferenced and the return value is the event writer oop.
3942 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3943 enum layout {
3944 rbp_off,
3945 rbpH_off,
3946 return_off,
3947 return_off2,
3948 framesize // inclusive of return address
3949 };
3950
3951 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3952 CodeBuffer code(name, 1024, 64);
3953 MacroAssembler* masm = new MacroAssembler(&code);
3954 address start = __ pc();
3955
3956 __ enter();
3957 address the_pc = __ pc();
3958
3959 int frame_complete = the_pc - start;
3960
3961 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3962 __ movptr(c_rarg0, r15_thread);
3963 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3964 __ reset_last_Java_frame(true);
3965
3966 // rax is jobject handle result, unpack and process it through a barrier.
3967 __ resolve_global_jobject(rax, c_rarg0);
3968
3969 __ leave();
3970 __ ret(0);
3971
3972 OopMapSet* oop_maps = new OopMapSet();
3973 OopMap* map = new OopMap(framesize, 1);
3974 oop_maps->add_gc_map(frame_complete, map);
3975
3976 RuntimeStub* stub =
3977 RuntimeStub::new_runtime_stub(name,
3978 &code,
3979 frame_complete,
3980 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3981 oop_maps,
3982 false);
3983 return stub;
3984 }
3985
3986 // For c2: call to return a leased buffer.
3987 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3988 enum layout {
3989 rbp_off,
3990 rbpH_off,
3991 return_off,
3992 return_off2,
3993 framesize // inclusive of return address
3994 };
3995
3996 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3997 CodeBuffer code(name, 1024, 64);
3998 MacroAssembler* masm = new MacroAssembler(&code);
3999 address start = __ pc();
4000
4001 __ enter();
4002 address the_pc = __ pc();
4003
4004 int frame_complete = the_pc - start;
4005
4006 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4007 __ movptr(c_rarg0, r15_thread);
4008 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4009 __ reset_last_Java_frame(true);
4010
4011 __ leave();
4012 __ ret(0);
4013
4014 OopMapSet* oop_maps = new OopMapSet();
4015 OopMap* map = new OopMap(framesize, 1);
4016 oop_maps->add_gc_map(frame_complete, map);
4017
4018 RuntimeStub* stub =
4019 RuntimeStub::new_runtime_stub(name,
4020 &code,
4021 frame_complete,
4022 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4023 oop_maps,
4024 false);
4025 return stub;
4026 }
4027
4028 #endif // INCLUDE_JFR