1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "classfile/symbolTable.hpp"
31 #include "code/aotCodeCache.hpp"
32 #include "code/compiledIC.hpp"
33 #include "code/debugInfoRec.hpp"
34 #include "code/nativeInst.hpp"
35 #include "code/vtableStubs.hpp"
36 #include "compiler/oopMap.hpp"
37 #include "gc/shared/collectedHeap.hpp"
38 #include "gc/shared/gcLocker.hpp"
39 #include "gc/shared/barrierSet.hpp"
40 #include "gc/shared/barrierSetAssembler.hpp"
41 #include "interpreter/interpreter.hpp"
42 #include "logging/log.hpp"
43 #include "memory/resourceArea.hpp"
44 #include "memory/universe.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "oops/method.inline.hpp"
47 #include "prims/methodHandles.hpp"
48 #include "runtime/continuation.hpp"
49 #include "runtime/continuationEntry.inline.hpp"
50 #include "runtime/globals.hpp"
51 #include "runtime/jniHandles.hpp"
52 #include "runtime/safepointMechanism.hpp"
53 #include "runtime/sharedRuntime.hpp"
54 #include "runtime/signature.hpp"
55 #include "runtime/stubRoutines.hpp"
56 #include "runtime/timerTrace.hpp"
57 #include "runtime/vframeArray.hpp"
58 #include "runtime/vm_version.hpp"
59 #include "utilities/align.hpp"
60 #include "utilities/checkedCast.hpp"
61 #include "utilities/formatBuffer.hpp"
62 #include "vmreg_x86.inline.hpp"
63 #ifdef COMPILER1
64 #include "c1/c1_Runtime1.hpp"
65 #endif
66 #ifdef COMPILER2
67 #include "opto/runtime.hpp"
68 #endif
69 #if INCLUDE_JVMCI
70 #include "jvmci/jvmciJavaClasses.hpp"
71 #endif
72
// Shorthand used by every stub generator in this file: "__ op(...)" emits
// through the current MacroAssembler.
#define __ masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

// Stack alignment expressed in VMReg stack slots (jint-sized units).
const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
82
// Describes the frame built by save_live_registers()/restore_live_registers():
// a full save of GPRs, flags and FPU/XMM (plus optional YMM/ZMM/opmask/eGPR
// state) with an OopMap-compatible slot layout.
class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
  // The XSAVE_AREA_* constants are byte offsets of the individual state
  // components within the FPU/extended-state save area on the stack.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Helpers that define <reg>_off / <reg>H_off enum pairs for each register,
  // scaled by the per-register byte width (16/16/32/8/64 bytes respectively).
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  // Slot offsets, lowest stack address first; must agree with the push order
  // in save_live_registers().
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Integer registers, in the order save_legacy_gprs() lays them out.
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
182
// Emit code that saves all live registers (GPRs, flags, FPU/XMM state and --
// depending on CPU features and save_wide_vectors -- YMM/ZMM upper halves,
// opmask registers and APX extended GPRs) into a freshly built frame, and
// build an OopMap describing where each register landed so deoptimization
// and GC can locate oops and debug info at this safepoint.
// total_frame_words is an out-parameter receiving the frame size in words.
// NOTE(review): additional_frame_words is not referenced in this body --
// confirm callers expect it to be ignored on x86_64.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  // NOTE(review): the alignment granule here is num_xmm_regs (16 or 32), which
  // is at least the 16 bytes the comment above promises -- confirm intent.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save all opmask (k) registers as well.
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // With APX, also save the extended general-purpose registers r16..r31.
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved by is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  // APX extended GPRs were stored above only when UseAPX is set.
  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  // (second 32-bit halves of the 64-bit registers, recorded via ->next()).
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
427
// Emit code that undoes save_live_registers(): restores the extended state
// (ZMM/YMM upper halves, opmask, APX eGPRs where applicable), then pops the
// FPU/XMM state, the legacy GPRs, flags and finally rbp, leaving only the
// caller's return address on the stack.
// Restore order mirrors the save order in reverse; restore_wide_vectors must
// match the save_wide_vectors value used when the frame was built.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the opmask (k) registers saved alongside the vector state.
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Restore the APX extended GPRs r16..r31 if they were saved.
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
511
// Emit code that restores only the Java result registers (xmm0, rax, rdx)
// from a save_live_registers() frame and then discards the rest of the save
// area, leaving the return address on top of the stack.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
529
530 // Is vector's size (in bytes) bigger than a size saved by default?
531 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
532 bool SharedRuntime::is_wide_vector(int size) {
533 return size > 16;
534 }
535
536 // ---------------------------------------------------------------------------
537 // Read the array of BasicTypes from a signature, and compute where the
538 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
539 // quantities. Values less than VMRegImpl::stack0 are registers, those above
540 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
541 // as framesizes are fixed.
542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.
544 // Register up to Register::number_of_registers are the 64-bit
545 // integer registers.
546
547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
548 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
549 // units regardless of build. Of course for i486 there is no 64 bit build
550
551 // The Java calling convention is a "shifted" version of the C ABI.
552 // By skipping the first C ABI register we can call non-static jni methods
553 // with small numbers of arguments without having to shuffle the arguments
554 // at all. Since we control the java ABI we ought to at least get some
555 // advantage out of it.
556
557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
558 VMRegPair *regs,
559 int total_args_passed) {
560
561 // Create the mapping between argument positions and
562 // registers.
563 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
564 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
565 };
566 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
567 j_farg0, j_farg1, j_farg2, j_farg3,
568 j_farg4, j_farg5, j_farg6, j_farg7
569 };
570
571
572 uint int_args = 0;
573 uint fp_args = 0;
574 uint stk_args = 0;
575
576 for (int i = 0; i < total_args_passed; i++) {
577 switch (sig_bt[i]) {
578 case T_BOOLEAN:
579 case T_CHAR:
580 case T_BYTE:
581 case T_SHORT:
582 case T_INT:
583 if (int_args < Argument::n_int_register_parameters_j) {
584 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
585 } else {
586 stk_args = align_up(stk_args, 2);
587 regs[i].set1(VMRegImpl::stack2reg(stk_args));
588 stk_args += 1;
589 }
590 break;
591 case T_VOID:
592 // halves of T_LONG or T_DOUBLE
593 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
594 regs[i].set_bad();
595 break;
596 case T_LONG:
597 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
598 // fall through
599 case T_OBJECT:
600 case T_ARRAY:
601 case T_ADDRESS:
602 if (int_args < Argument::n_int_register_parameters_j) {
603 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
604 } else {
605 stk_args = align_up(stk_args, 2);
606 regs[i].set2(VMRegImpl::stack2reg(stk_args));
607 stk_args += 2;
608 }
609 break;
610 case T_FLOAT:
611 if (fp_args < Argument::n_float_register_parameters_j) {
612 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
613 } else {
614 stk_args = align_up(stk_args, 2);
615 regs[i].set1(VMRegImpl::stack2reg(stk_args));
616 stk_args += 1;
617 }
618 break;
619 case T_DOUBLE:
620 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
621 if (fp_args < Argument::n_float_register_parameters_j) {
622 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
623 } else {
624 stk_args = align_up(stk_args, 2);
625 regs[i].set2(VMRegImpl::stack2reg(stk_args));
626 stk_args += 2;
627 }
628 break;
629 default:
630 ShouldNotReachHere();
631 break;
632 }
633 }
634
635 return stk_args;
636 }
637
// Same as java_calling_convention() but for multiple return
// values. There's no way to store them on the stack so if we don't
// have enough registers, multiple values can't be returned.
// The "+1" reflects that rax is usable in addition to the j_rarg registers
// (see the INT_ArgReg table in java_return_convention()).
const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
644 VMRegPair *regs,
645 int total_args_passed) {
646 // Create the mapping between argument positions and
647 // registers.
648 static const Register INT_ArgReg[java_return_convention_max_int] = {
649 rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
650 };
651 static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
652 j_farg0, j_farg1, j_farg2, j_farg3,
653 j_farg4, j_farg5, j_farg6, j_farg7
654 };
655
656
657 uint int_args = 0;
658 uint fp_args = 0;
659
660 for (int i = 0; i < total_args_passed; i++) {
661 switch (sig_bt[i]) {
662 case T_BOOLEAN:
663 case T_CHAR:
664 case T_BYTE:
665 case T_SHORT:
666 case T_INT:
667 if (int_args < Argument::n_int_register_parameters_j+1) {
668 regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
669 int_args++;
670 } else {
671 return -1;
672 }
673 break;
674 case T_VOID:
675 // halves of T_LONG or T_DOUBLE
676 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
677 regs[i].set_bad();
678 break;
679 case T_LONG:
680 assert(sig_bt[i + 1] == T_VOID, "expecting half");
681 // fall through
682 case T_OBJECT:
683 case T_ARRAY:
684 case T_ADDRESS:
685 case T_METADATA:
686 if (int_args < Argument::n_int_register_parameters_j+1) {
687 regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
688 int_args++;
689 } else {
690 return -1;
691 }
692 break;
693 case T_FLOAT:
694 if (fp_args < Argument::n_float_register_parameters_j) {
695 regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
696 fp_args++;
697 } else {
698 return -1;
699 }
700 break;
701 case T_DOUBLE:
702 assert(sig_bt[i + 1] == T_VOID, "expecting half");
703 if (fp_args < Argument::n_float_register_parameters_j) {
704 regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
705 fp_args++;
706 } else {
707 return -1;
708 }
709 break;
710 default:
711 ShouldNotReachHere();
712 break;
713 }
714 }
715
716 return int_args + fp_args;
717 }
718
// Patch the callers callsite with entry to compiled code if it exists.
// Expects rbx to hold the Method*; if Method::code() is non-null, saves the
// full CPU state, calls SharedRuntime::fixup_callers_callsite(method,
// return_address), then restores state. Otherwise falls straight through.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // Nothing to do when the method has no compiled code.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);   // arg0: the Method*
  __ mov(c_rarg1, rax);   // arg1: the caller's return address
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
760
761 // For each inline type argument, sig includes the list of fields of
762 // the inline type. This utility function computes the number of
763 // arguments for the call if inline types are passed by reference (the
764 // calling convention the interpreter expects).
765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
766 int total_args_passed = 0;
767 if (InlineTypePassFieldsAsArgs) {
768 for (int i = 0; i < sig_extended->length(); i++) {
769 BasicType bt = sig_extended->at(i)._bt;
770 if (bt == T_METADATA) {
771 // In sig_extended, an inline type argument starts with:
772 // T_METADATA, followed by the types of the fields of the
773 // inline type and T_VOID to mark the end of the value
774 // type. Inline types are flattened so, for instance, in the
775 // case of an inline type with an int field and an inline type
776 // field that itself has 2 fields, an int and a long:
777 // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
778 // slot for the T_LONG) T_VOID (inner inline type) T_VOID
779 // (outer inline type)
780 total_args_passed++;
781 int vt = 1;
782 do {
783 i++;
784 BasicType bt = sig_extended->at(i)._bt;
785 BasicType prev_bt = sig_extended->at(i-1)._bt;
786 if (bt == T_METADATA) {
787 vt++;
788 } else if (bt == T_VOID &&
789 prev_bt != T_LONG &&
790 prev_bt != T_DOUBLE) {
791 vt--;
792 }
793 } while (vt != 0);
794 } else {
795 total_args_passed++;
796 }
797 }
798 } else {
799 total_args_passed = sig_extended->length();
800 }
801 return total_args_passed;
802 }
803
804
// Emit code that moves one compiled-convention argument (register, stack slot
// or XMM register described by reg_pair) into the interpreter's expected
// location `to`. T_VOID entries (second halves of long/double) emit nothing.
// `extraspace` is the offset to add when reading a caller stack slot relative
// to rsp; `is_oop` routes the store through the GC barrier.
static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However to make things extra confusing. Because we can fit a long/double in
  // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  if (!r_1->is_valid()) {
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    Register val = rax;
    if (r_1->is_stack()) {
      // Argument lives in the caller's frame: load it into a scratch register.
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      // Preserve r13/rbx (used as barrier temps) across the oop store.
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    // Floating-point argument: store the XMM register directly.
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}
869
// Generate a C2I adapter: repack arguments from the compiled, register-based
// (and possibly scalarized) layout described by 'regs' into the interpreter's
// all-on-stack layout, then jump to the interpreter entry of the callee
// Method* held in rbx.
//   sig_extended - extended signature; inline type arguments appear as
//                  scalarized T_METADATA <fields...> T_VOID groups
//   requires_clinit_barrier - emit a class initialization check (static methods)
//   c2i_no_clinit_check_entry - out: pc just past the clinit barrier
//   skip_fixup   - label bound after the callsite-patching code; the
//                  unverified entry falls through to it
//   start, oop_maps, frame_complete, frame_size_in_words - GC/debug
//                  bookkeeping for the buffer-allocating runtime call below
//   alloc_inline_receiver - whether the receiver also needs a buffer
static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    // Entry point for callers that have already passed the clinit check.
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least an inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      // allocate_inline_types(thread, callee Method*, alloc_inline_receiver)
      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      // If the allocation raised an exception, clear the result and forward
      // the pending exception instead of continuing into the interpreter.
      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
      __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      // Regular (non-scalarized) argument: copy it straight to its
      // interpreter stack slot.
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      ignored++;
      next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      Label not_null_buffer;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          // Nested inline type starts here.
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          // End-of-inline-type marker (not a long/double half).
          vt--;
          ignored++;
        } else if (sig_extended->at(next_arg_comp)._vt_oop) {
          // buffer argument: use if non null
          VMReg buffer = regs[next_arg_comp-ignored].first();
          if (buffer->is_stack()) {
            int ld_off = buffer->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
            __ movptr(r14, Address(rsp, ld_off));
          } else {
            __ movptr(r14, buffer->as_Register());
          }
          __ testptr(r14, r14);
          __ jcc(Assembler::notEqual, not_null_buffer);
          // otherwise get the buffer from the just allocated pool of buffers
          // (the object array whose address is in rscratch2, see above)
          int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
          __ load_heap_oop(r14, Address(rscratch2, index));
          next_vt_arg++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            // Null: pass a null reference to the interpreter and skip the
            // remaining stores for this inline type.
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          // Field of the inline type: store it into the buffer (r14) at its
          // field offset, through a GC barrier if it is an oop.
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ bind(not_null_buffer);
      __ movptr(Address(rsp, st_off), r14);
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
1101
// Generate an I2C adapter: pull arguments out of the interpreter's
// stack-based layout (located via the saved SP) into the compiled calling
// convention described by 'regs', then jump to the verified compiled entry
// of the Method* held in rbx.
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest
  // frame always sees it, as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    // Use (and clear) the alternate target instead of the compiled entry.
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  int total_args_passed = sig->length();

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    // Destination of this argument in the compiled convention.
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM argument: single slot means float, pair means double.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1279
// Emit the unverified (inline cache) entry: ic_check verifies the receiver
// against the CompiledICData in rax; on success load the speculated Method*
// into rbx and fall through to the verified entry via skip_fixup, unless the
// method has since been compiled (then treat as an IC miss).
static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
  Register data = rax;
  __ ic_check(1 /* end_alignment */);
  __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));

  // Method might have been compiled since the call site was patched to
  // interpreted; if that is the case treat it as a miss so we can get
  // the call site corrected.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, skip_fixup);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
}
1292
1293 // ---------------------------------------------------------------
// ---------------------------------------------------------------
// Generate the full set of i2c/c2i adapter entries for one signature:
// the i2c adapter and the scalarized/non-scalarized c2i adapters with
// their unverified variants. Each entry pc is recorded in entry_address,
// and the code is optionally wrapped into an AdapterBlob.
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
                                            int comp_args_on_stack,
                                            const GrowableArray<SigEntry>* sig,
                                            const VMRegPair* regs,
                                            const GrowableArray<SigEntry>* sig_cc,
                                            const VMRegPair* regs_cc,
                                            const GrowableArray<SigEntry>* sig_cc_ro,
                                            const VMRegPair* regs_cc_ro,
                                            address entry_address[AdapterBlob::ENTRY_COUNT],
                                            AdapterBlob*& new_adapter,
                                            bool allocate_code_blob) {
  entry_address[AdapterBlob::I2C] = __ pc();
  gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
  Label skip_fixup;

  gen_inline_cache_check(masm, skip_fixup);

  OopMapSet* oop_maps = new OopMapSet();
  int frame_complete = CodeOffsets::frame_never_safe;
  int frame_size_in_words = 0;

  // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
  if (regs_cc != regs_cc_ro) {
    // No class init barrier needed because method is guaranteed to be non-static
    gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
    skip_fixup.reset();
  }

  // Scalarized c2i adapter
  entry_address[AdapterBlob::C2I] = __ pc();
  entry_address[AdapterBlob::C2I_Inline] = __ pc();
  gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                  skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);

  // Non-scalarized c2i adapter
  if (regs != regs_cc) {
    // The inline entries get their own IC check and adapter; this overwrites
    // the placeholder pcs recorded above.
    entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
    Label inline_entry_skip_fixup;
    gen_inline_cache_check(masm, inline_entry_skip_fixup);

    entry_address[AdapterBlob::C2I_Inline] = __ pc();
    gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
  }

  // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
  // the GC knows about the location of oop argument locations passed to the c2i adapter.
  if (allocate_code_blob) {
    bool caller_must_gc_arguments = (regs != regs_cc);
    int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
    assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
    AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
    new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
  }
}
1364
// Compute the native (C) calling convention for the given signature:
// fills 'regs' with the location of each argument and returns the number
// of 32-bit stack slots needed for outgoing arguments.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        // On Windows, INT and FP registers share argument positions, so an
        // int argument also consumes the matching FP register slot.
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        // Keep the shared INT position in sync (see note above).
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
1478
1479 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1480 uint num_bits,
1481 uint total_args_passed) {
1482 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1483 "only certain vector sizes are supported for now");
1484
1485 static const XMMRegister VEC_ArgReg[32] = {
1486 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1487 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1488 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1489 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1490 };
1491
1492 uint stk_args = 0;
1493 uint fp_args = 0;
1494
1495 for (uint i = 0; i < total_args_passed; i++) {
1496 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1497 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1498 regs[i].set_pair(vmreg->next(next_val), vmreg);
1499 }
1500
1501 return stk_args;
1502 }
1503
1504 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1505 // We always ignore the frame_slots arg and just use the space just below frame pointer
1506 // which by this time is free to use
1507 switch (ret_type) {
1508 case T_FLOAT:
1509 __ movflt(Address(rbp, -wordSize), xmm0);
1510 break;
1511 case T_DOUBLE:
1512 __ movdbl(Address(rbp, -wordSize), xmm0);
1513 break;
1514 case T_VOID: break;
1515 default: {
1516 __ movptr(Address(rbp, -wordSize), rax);
1517 }
1518 }
1519 }
1520
1521 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1522 // We always ignore the frame_slots arg and just use the space just below frame pointer
1523 // which by this time is free to use
1524 switch (ret_type) {
1525 case T_FLOAT:
1526 __ movflt(xmm0, Address(rbp, -wordSize));
1527 break;
1528 case T_DOUBLE:
1529 __ movdbl(xmm0, Address(rbp, -wordSize));
1530 break;
1531 case T_VOID: break;
1532 default: {
1533 __ movptr(rax, Address(rbp, -wordSize));
1534 }
1535 }
1536 }
1537
1538 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1539 for ( int i = first_arg ; i < arg_count ; i++ ) {
1540 if (args[i].first()->is_Register()) {
1541 __ push(args[i].first()->as_Register());
1542 } else if (args[i].first()->is_XMMRegister()) {
1543 __ subptr(rsp, 2*wordSize);
1544 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1545 }
1546 }
1547 }
1548
1549 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1550 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1551 if (args[i].first()->is_Register()) {
1552 __ pop(args[i].first()->as_Register());
1553 } else if (args[i].first()->is_XMMRegister()) {
1554 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1555 __ addptr(rsp, 2*wordSize);
1556 }
1557 }
1558 }
1559
1560 static void verify_oop_args(MacroAssembler* masm,
1561 const methodHandle& method,
1562 const BasicType* sig_bt,
1563 const VMRegPair* regs) {
1564 Register temp_reg = rbx; // not part of any compiled calling seq
1565 if (VerifyOops) {
1566 for (int i = 0; i < method->size_of_parameters(); i++) {
1567 if (is_reference_type(sig_bt[i])) {
1568 VMReg r = regs[i].first();
1569 assert(r->is_valid(), "bad oop arg");
1570 if (r->is_stack()) {
1571 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1572 __ verify_oop(temp_reg);
1573 } else {
1574 __ verify_oop(r->as_Register());
1575 }
1576 }
1577 }
1578 }
1579 }
1580
// Debug-only sanity check that a Continuation.enter argument arrived in the
// register the stub expects (never on the stack).
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1589
1590
1591 //---------------------------- continuation_enter_setup ---------------------------
1592 //
1593 // Arguments:
1594 // None.
1595 //
1596 // Results:
1597 // rsp: pointer to blank ContinuationEntry
1598 //
1599 // Kills:
1600 // rax
1601 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Reserve stack space for a blank ContinuationEntry.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // Frame size covers the entry plus one extra word (for the return address).
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new entry into the thread's chain: parent = previous head,
  // then publish rsp (the new entry) as the thread's current cont_entry.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1619
1620 //---------------------------- fill_continuation_entry ---------------------------
1621 //
1622 // Arguments:
1623 // rsp: pointer to blank Continuation entry
1624 // reg_cont_obj: pointer to the continuation
1625 // reg_flags: flags
1626 //
1627 // Results:
1628 // rsp: pointer to filled out ContinuationEntry
1629 //
1630 // Kills:
1631 // rax
1632 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug cookie so a corrupted entry can be detected later.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Initialize the entry: continuation oop, flags, and zeroed chunk/argsize/
  // pin-count fields.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the caller's cont_fastpath in the entry (restored by
  // continuation_enter_cleanup), then clear the thread's field.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1649
//---------------------------- continuation_enter_cleanup ---------------------------
//
// Undoes continuation_enter_setup()/fill_continuation_entry(): restores the
// thread state saved in the ContinuationEntry, unlinks the entry from the
// thread's chain, and pops it off the stack.
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // Verify that rsp points at the thread's current ContinuationEntry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the parent's cont_fastpath saved in fill_continuation_entry().
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  // Unlink this entry: thread->cont_entry = entry->parent.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  // Pop the ContinuationEntry off the stack.
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1675
// Generates the code for Continuation.enterSpecial(Continuation c,
// boolean isContinue, boolean isVirtualThread). Two entries are emitted:
// an ad-hoc interpreted (i2i) entry used only in interp_only_mode, and the
// regular compiled entry. Both set up a ContinuationEntry frame and then
// either call the thaw stub (isContinue) or resolve and call
// Continuation.enter(). An exception-handling path is emitted last.
//
// Out parameters (all offsets are relative to the start of the emitted code):
//   exception_offset         - start of the exception-handling path
//   frame_complete           - point at which the compiled-entry frame is complete
//   stack_slots              - frame size in stack slots (set by setup)
//   interpreted_entry_offset - start of the i2i entry
//   compiled_entry_offset    - start of the compiled entry
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj = 0;
  int pos_is_cont = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj = c_rarg1;
  Register reg_is_cont = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    // Register the oop map at the call's return pc.
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  // Record pc offsets the runtime uses to locate the thaw call and its return
  // address within this blob.
  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  // Tear down the ContinuationEntry frame before dispatching to the handler.
  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1848
// Generates the code for Continuation.doYield(): calls
// Continuation::freeze_entry() to freeze the current continuation. If freeze
// returns zero (success), we pop back to the ContinuationEntry frame, clean
// it up, and return from there. If freeze returns non-zero, the continuation
// is pinned and we return to the caller, forwarding any pending exception
// that freeze installed.
//
// Out parameters:
//   frame_complete        - pc offset at which the frame is complete
//   stack_slots           - frame size in stack slots
//   compiled_entry_offset - pc offset of the compiled entry
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Minimal frame: saved rbp plus return address (two slots each).
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Call freeze_entry(thread, sp) with a walkable last Java frame.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // Non-zero result from freeze means the continuation is pinned.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: jump the stack back to the ContinuationEntry and
  // return from the enter frame.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1910
// Public wrapper: forwards to the file-local continuation_enter_cleanup()
// above so code outside this file can emit the same cleanup sequence.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1914
// Emits the compiled entry for a signature-polymorphic (method handle)
// intrinsic: invokeBasic, the linkTo* family, or linkToNative. Instead of a
// full native wrapper, this loads the receiver and/or the trailing
// MemberName/NativeEntryPoint argument into registers and dispatches via
// MethodHandles::generate_method_handle_dispatch().
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool has_receiver = false;
  Register receiver_reg = noreg;
  int member_arg_pos = -1;
  Register member_reg = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
    member_reg = rbx; // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
    member_reg = rbx; // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Argument is on the stack: load it (wordSize skips the return address).
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note: This assumes that compiled calling conventions always
      // pass the receiver oop in a register. If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      // The two lines below are unreachable after fatal(); they illustrate
      // what a port without register receivers would need to do.
      receiver_reg = j_rarg0; // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1976
1977 // ---------------------------------------------------------------------------
1978 // Generate a native wrapper for a given method. The method takes arguments
1979 // in the Java compiled code convention, marshals them to the native
1980 // convention (handlizes oops, etc), transitions to native, makes the call,
1981 // returns to java state (possibly blocking), unhandlizes any result and
1982 // returns.
1983 //
1984 // Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
1986 // functions. The wrapper is expected to unpack the arguments before
1987 // passing them to the callee. Critical native functions leave the state _in_Java,
1988 // since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1992 //
1993 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1994 const methodHandle& method,
1995 int compile_id,
1996 BasicType* in_sig_bt,
1997 VMRegPair* in_regs,
1998 BasicType ret_type) {
1999 if (method->is_continuation_native_intrinsic()) {
2000 int exception_offset = -1;
2001 OopMapSet* oop_maps = new OopMapSet();
2002 int frame_complete = -1;
2003 int stack_slots = -1;
2004 int interpreted_entry_offset = -1;
2005 int vep_offset = -1;
2006 if (method->is_continuation_enter_intrinsic()) {
2007 gen_continuation_enter(masm,
2008 in_regs,
2009 exception_offset,
2010 oop_maps,
2011 frame_complete,
2012 stack_slots,
2013 interpreted_entry_offset,
2014 vep_offset);
2015 } else if (method->is_continuation_yield_intrinsic()) {
2016 gen_continuation_yield(masm,
2017 in_regs,
2018 oop_maps,
2019 frame_complete,
2020 stack_slots,
2021 vep_offset);
2022 } else {
2023 guarantee(false, "Unknown Continuation native intrinsic");
2024 }
2025
2026 #ifdef ASSERT
2027 if (method->is_continuation_enter_intrinsic()) {
2028 assert(interpreted_entry_offset != -1, "Must be set");
2029 assert(exception_offset != -1, "Must be set");
2030 } else {
2031 assert(interpreted_entry_offset == -1, "Must be unset");
2032 assert(exception_offset == -1, "Must be unset");
2033 }
2034 assert(frame_complete != -1, "Must be set");
2035 assert(stack_slots != -1, "Must be set");
2036 assert(vep_offset != -1, "Must be set");
2037 #endif
2038
2039 __ flush();
2040 nmethod* nm = nmethod::new_native_nmethod(method,
2041 compile_id,
2042 masm->code(),
2043 vep_offset,
2044 frame_complete,
2045 stack_slots,
2046 in_ByteSize(-1),
2047 in_ByteSize(-1),
2048 oop_maps,
2049 exception_offset);
2050 if (nm == nullptr) return nm;
2051 if (method->is_continuation_enter_intrinsic()) {
2052 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2053 } else if (method->is_continuation_yield_intrinsic()) {
2054 _cont_doYield_stub = nm;
2055 }
2056 return nm;
2057 }
2058
2059 if (method->is_method_handle_intrinsic()) {
2060 vmIntrinsics::ID iid = method->intrinsic_id();
2061 intptr_t start = (intptr_t)__ pc();
2062 int vep_offset = ((intptr_t)__ pc()) - start;
2063 gen_special_dispatch(masm,
2064 method,
2065 in_sig_bt,
2066 in_regs);
2067 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
2068 __ flush();
2069 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
2070 return nmethod::new_native_nmethod(method,
2071 compile_id,
2072 masm->code(),
2073 vep_offset,
2074 frame_complete,
2075 stack_slots / VMRegImpl::slots_per_word,
2076 in_ByteSize(-1),
2077 in_ByteSize(-1),
2078 nullptr);
2079 }
2080 address native_func = method->native_function();
2081 assert(native_func != nullptr, "must have function");
2082
2083 // An OopMap for lock (and class if static)
2084 OopMapSet *oop_maps = new OopMapSet();
2085 intptr_t start = (intptr_t)__ pc();
2086
2087 // We have received a description of where all the java arg are located
2088 // on entry to the wrapper. We need to convert these args to where
2089 // the jni function will expect them. To figure out where they go
2090 // we convert the java signature to a C signature by inserting
2091 // the hidden arguments as arg[0] and possibly arg[1] (static method)
2092
2093 const int total_in_args = method->size_of_parameters();
2094 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2095
2096 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2097 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2098
2099 int argc = 0;
2100 out_sig_bt[argc++] = T_ADDRESS;
2101 if (method->is_static()) {
2102 out_sig_bt[argc++] = T_OBJECT;
2103 }
2104
2105 for (int i = 0; i < total_in_args ; i++ ) {
2106 out_sig_bt[argc++] = in_sig_bt[i];
2107 }
2108
2109 // Now figure out where the args must be stored and how much stack space
2110 // they require.
2111 int out_arg_slots;
2112 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2113
2114 // Compute framesize for the wrapper. We need to handlize all oops in
2115 // incoming registers
2116
2117 // Calculate the total number of stack slots we will need.
2118
2119 // First count the abi requirement plus all of the outgoing args
2120 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2121
2122 // Now the space for the inbound oop handle area
2123 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
2124
2125 int oop_handle_offset = stack_slots;
2126 stack_slots += total_save_slots;
2127
2128 // Now any space we need for handlizing a klass if static method
2129
2130 int klass_slot_offset = 0;
2131 int klass_offset = -1;
2132 int lock_slot_offset = 0;
2133 bool is_static = false;
2134
2135 if (method->is_static()) {
2136 klass_slot_offset = stack_slots;
2137 stack_slots += VMRegImpl::slots_per_word;
2138 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2139 is_static = true;
2140 }
2141
2142 // Plus a lock if needed
2143
2144 if (method->is_synchronized()) {
2145 lock_slot_offset = stack_slots;
2146 stack_slots += VMRegImpl::slots_per_word;
2147 }
2148
2149 // Now a place (+2) to save return values or temp during shuffling
2150 // + 4 for return address (which we own) and saved rbp
2151 stack_slots += 6;
2152
2153 // Ok The space we have allocated will look like:
2154 //
2155 //
2156 // FP-> | |
2157 // |---------------------|
2158 // | 2 slots for moves |
2159 // |---------------------|
2160 // | lock box (if sync) |
2161 // |---------------------| <- lock_slot_offset
2162 // | klass (if static) |
2163 // |---------------------| <- klass_slot_offset
2164 // | oopHandle area |
2165 // |---------------------| <- oop_handle_offset (6 java arg registers)
2166 // | outbound memory |
2167 // | based arguments |
2168 // | |
2169 // |---------------------|
2170 // | |
2171 // SP-> | out_preserved_slots |
2172 //
2173 //
2174
2175
2176 // Now compute actual number of stack words we need rounding to make
2177 // stack properly aligned.
2178 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2179
2180 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2181
2182 // First thing make an ic check to see if we should even be here
2183
2184 // We are free to use all registers as temps without saving them and
2185 // restoring them except rbp. rbp is the only callee save register
2186 // as far as the interpreter and the compiler(s) are concerned.
2187
2188 const Register receiver = j_rarg0;
2189
2190 Label exception_pending;
2191
2192 assert_different_registers(receiver, rscratch1, rscratch2);
2193 __ verify_oop(receiver);
2194 __ ic_check(8 /* end_alignment */);
2195
2196 int vep_offset = ((intptr_t)__ pc()) - start;
2197
2198 if (method->needs_clinit_barrier()) {
2199 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2200 Label L_skip_barrier;
2201 Register klass = r10;
2202 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2203 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2204
2205 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2206
2207 __ bind(L_skip_barrier);
2208 }
2209
2210 #ifdef COMPILER1
2211 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2212 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2213 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2214 }
2215 #endif // COMPILER1
2216
2217 // The instruction at the verified entry point must be 5 bytes or longer
2218 // because it can be patched on the fly by make_non_entrant. The stack bang
2219 // instruction fits that requirement.
2220
2221 // Generate stack overflow check
2222 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2223
2224 // Generate a new frame for the wrapper.
2225 __ enter();
2226 // -2 because return address is already present and so is saved rbp
2227 __ subptr(rsp, stack_size - 2*wordSize);
2228
2229 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2230 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2231 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2232
2233 // Frame is now completed as far as size and linkage.
2234 int frame_complete = ((intptr_t)__ pc()) - start;
2235
2236 #ifdef ASSERT
2237 __ check_stack_alignment(rsp, "improperly aligned stack");
2238 #endif /* ASSERT */
2239
2240
2241 // We use r14 as the oop handle for the receiver/klass
2242 // It is callee save so it survives the call to native
2243
2244 const Register oop_handle_reg = r14;
2245
2246 //
2247 // We immediately shuffle the arguments so that any vm call we have to
2248 // make from here on out (sync slow path, jvmti, etc.) we will have
2249 // captured the oops from our caller and have a valid oopMap for
2250 // them.
2251
2252 // -----------------
2253 // The Grand Shuffle
2254
2255 // The Java calling convention is either equal (linux) or denser (win64) than the
2256 // c calling convention. However the because of the jni_env argument the c calling
2257 // convention always has at least one more (and two for static) arguments than Java.
2258 // Therefore if we move the args from java -> c backwards then we will never have
2259 // a register->register conflict and we don't have to build a dependency graph
2260 // and figure out how to break any cycles.
2261 //
2262
2263 // Record esp-based slot for receiver on stack for non-static methods
2264 int receiver_offset = -1;
2265
2266 // This is a trick. We double the stack slots so we can claim
2267 // the oops in the caller's frame. Since we are sure to have
2268 // more args than the caller doubling is enough to make
2269 // sure we can capture all the incoming oop args from the
2270 // caller.
2271 //
2272 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2273
2274 // Mark location of rbp (someday)
2275 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2276
2277 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2278 // All inbound args are referenced based on rbp and all outbound args via rsp.
2279
2280
2281 #ifdef ASSERT
2282 bool reg_destroyed[Register::number_of_registers];
2283 bool freg_destroyed[XMMRegister::number_of_registers];
2284 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2285 reg_destroyed[r] = false;
2286 }
2287 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2288 freg_destroyed[f] = false;
2289 }
2290
2291 #endif /* ASSERT */
2292
2293 // For JNI natives the incoming and outgoing registers are offset upwards.
2294 GrowableArray<int> arg_order(2 * total_in_args);
2295
2296 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2297 arg_order.push(i);
2298 arg_order.push(c_arg);
2299 }
2300
2301 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2302 int i = arg_order.at(ai);
2303 int c_arg = arg_order.at(ai + 1);
2304 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2305 #ifdef ASSERT
2306 if (in_regs[i].first()->is_Register()) {
2307 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2308 } else if (in_regs[i].first()->is_XMMRegister()) {
2309 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2310 }
2311 if (out_regs[c_arg].first()->is_Register()) {
2312 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2313 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2314 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2315 }
2316 #endif /* ASSERT */
2317 switch (in_sig_bt[i]) {
2318 case T_ARRAY:
2319 case T_OBJECT:
2320 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2321 ((i == 0) && (!is_static)),
2322 &receiver_offset);
2323 break;
2324 case T_VOID:
2325 break;
2326
2327 case T_FLOAT:
2328 __ float_move(in_regs[i], out_regs[c_arg]);
2329 break;
2330
2331 case T_DOUBLE:
2332 assert( i + 1 < total_in_args &&
2333 in_sig_bt[i + 1] == T_VOID &&
2334 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2335 __ double_move(in_regs[i], out_regs[c_arg]);
2336 break;
2337
2338 case T_LONG :
2339 __ long_move(in_regs[i], out_regs[c_arg]);
2340 break;
2341
2342 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2343
2344 default:
2345 __ move32_64(in_regs[i], out_regs[c_arg]);
2346 }
2347 }
2348
2349 int c_arg;
2350
2351 // Pre-load a static method's oop into r14. Used both by locking code and
2352 // the normal JNI call code.
2353 // point c_arg at the first arg that is already loaded in case we
2354 // need to spill before we call out
2355 c_arg = total_c_args - total_in_args;
2356
2357 if (method->is_static()) {
2358
2359 // load oop into a register
2360 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2361
2362 // Now handlize the static class mirror it's known not-null.
2363 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2364 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2365
2366 // Now get the handle
2367 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2368 // store the klass handle as second argument
2369 __ movptr(c_rarg1, oop_handle_reg);
2370 // and protect the arg if we must spill
2371 c_arg--;
2372 }
2373
2374 // Change state to native (we save the return address in the thread, since it might not
2375 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2376 // points into the right code segment. It does not have to be the correct return pc.
2377 // We use the same pc/oopMap repeatedly when we call out
2378
2379 Label native_return;
2380 if (method->is_object_wait0()) {
2381 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2382 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2383 } else {
2384 intptr_t the_pc = (intptr_t) __ pc();
2385 oop_maps->add_gc_map(the_pc - start, map);
2386
2387 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2388 }
2389
2390 // We have all of the arguments setup at this point. We must not touch any register
2391 // argument registers at this point (what if we save/restore them there are no oop?
2392
2393 if (DTraceMethodProbes) {
2394 // protect the args we've loaded
2395 save_args(masm, total_c_args, c_arg, out_regs);
2396 __ mov_metadata(c_rarg1, method());
2397 __ call_VM_leaf(
2398 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2399 r15_thread, c_rarg1);
2400 restore_args(masm, total_c_args, c_arg, out_regs);
2401 }
2402
2403 // RedefineClasses() tracing support for obsolete method entry
2404 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2405 // protect the args we've loaded
2406 save_args(masm, total_c_args, c_arg, out_regs);
2407 __ mov_metadata(c_rarg1, method());
2408 __ call_VM_leaf(
2409 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2410 r15_thread, c_rarg1);
2411 restore_args(masm, total_c_args, c_arg, out_regs);
2412 }
2413
2414 // Lock a synchronized method
2415
2416 // Register definitions used by locking and unlocking
2417
2418 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2419 const Register obj_reg = rbx; // Will contain the oop
2420 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2421
2422 Label slow_path_lock;
2423 Label lock_done;
2424
2425 if (method->is_synchronized()) {
2426 // Get the handle (the 2nd argument)
2427 __ mov(oop_handle_reg, c_rarg1);
2428
2429 // Get address of the box
2430
2431 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2432
2433 // Load the oop from the handle
2434 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2435
2436 __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2437
2438 // Slow path will re-enter here
2439 __ bind(lock_done);
2440 }
2441
2442 // Finally just about ready to make the JNI call
2443
2444 // get JNIEnv* which is first argument to native
2445 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2446
2447 // Now set thread in native
2448 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2449
2450 __ call(RuntimeAddress(native_func));
2451
2452 // Verify or restore cpu control state after JNI call
2453 __ restore_cpu_control_state_after_jni(rscratch1);
2454
2455 // Unpack native results.
2456 switch (ret_type) {
2457 case T_BOOLEAN: __ c2bool(rax); break;
2458 case T_CHAR : __ movzwl(rax, rax); break;
2459 case T_BYTE : __ sign_extend_byte (rax); break;
2460 case T_SHORT : __ sign_extend_short(rax); break;
2461 case T_INT : /* nothing to do */ break;
2462 case T_DOUBLE :
2463 case T_FLOAT :
2464 // Result is in xmm0 we'll save as needed
2465 break;
2466 case T_ARRAY: // Really a handle
2467 case T_OBJECT: // Really a handle
2468 break; // can't de-handlize until after safepoint check
2469 case T_VOID: break;
2470 case T_LONG: break;
2471 default : ShouldNotReachHere();
2472 }
2473
2474 // Switch thread to "native transition" state before reading the synchronization state.
2475 // This additional state is necessary because reading and testing the synchronization
2476 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2477 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2478 // VM thread changes sync state to synchronizing and suspends threads for GC.
2479 // Thread A is resumed to finish this native method, but doesn't block here since it
2480 // didn't see any synchronization is progress, and escapes.
2481 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2482
2483 // Force this write out before the read below
2484 if (!UseSystemMemoryBarrier) {
2485 __ membar(Assembler::Membar_mask_bits(
2486 Assembler::LoadLoad | Assembler::LoadStore |
2487 Assembler::StoreLoad | Assembler::StoreStore));
2488 }
2489
2490 // check for safepoint operation in progress and/or pending suspend requests
2491 {
2492 Label Continue;
2493 Label slow_path;
2494
2495 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2496
2497 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2498 __ jcc(Assembler::equal, Continue);
2499 __ bind(slow_path);
2500
2501 // Don't use call_VM as it will see a possible pending exception and forward it
2502 // and never return here preventing us from clearing _last_native_pc down below.
2503 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2504 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2505 // by hand.
2506 //
2507 __ vzeroupper();
2508 save_native_result(masm, ret_type, stack_slots);
2509 __ mov(c_rarg0, r15_thread);
2510 __ mov(r12, rsp); // remember sp
2511 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2512 __ andptr(rsp, -16); // align stack as required by ABI
2513 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2514 __ mov(rsp, r12); // restore sp
2515 __ reinit_heapbase();
2516 // Restore any method result value
2517 restore_native_result(masm, ret_type, stack_slots);
2518 __ bind(Continue);
2519 }
2520
2521 // change thread state
2522 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2523
2524 if (method->is_object_wait0()) {
2525 // Check preemption for Object.wait()
2526 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2527 __ cmpptr(rscratch1, NULL_WORD);
2528 __ jccb(Assembler::equal, native_return);
2529 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2530 __ jmp(rscratch1);
2531 __ bind(native_return);
2532
2533 intptr_t the_pc = (intptr_t) __ pc();
2534 oop_maps->add_gc_map(the_pc - start, map);
2535 }
2536
2537
2538 Label reguard;
2539 Label reguard_done;
2540 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2541 __ jcc(Assembler::equal, reguard);
2542 __ bind(reguard_done);
2543
2544 // native result if any is live
2545
2546 // Unlock
2547 Label slow_path_unlock;
2548 Label unlock_done;
2549 if (method->is_synchronized()) {
2550
2551 Label fast_done;
2552
2553 // Get locked oop from the handle we passed to jni
2554 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2555
2556 // Must save rax if it is live now because cmpxchg must use it
2557 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2558 save_native_result(masm, ret_type, stack_slots);
2559 }
2560
2561 __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2562
2563 // slow path re-enters here
2564 __ bind(unlock_done);
2565 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2566 restore_native_result(masm, ret_type, stack_slots);
2567 }
2568
2569 __ bind(fast_done);
2570 }
2571 if (DTraceMethodProbes) {
2572 save_native_result(masm, ret_type, stack_slots);
2573 __ mov_metadata(c_rarg1, method());
2574 __ call_VM_leaf(
2575 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2576 r15_thread, c_rarg1);
2577 restore_native_result(masm, ret_type, stack_slots);
2578 }
2579
2580 __ reset_last_Java_frame(false);
2581
2582 // Unbox oop result, e.g. JNIHandles::resolve value.
2583 if (is_reference_type(ret_type)) {
2584 __ resolve_jobject(rax /* value */,
2585 rcx /* tmp */);
2586 }
2587
2588 if (CheckJNICalls) {
2589 // clear_pending_jni_exception_check
2590 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2591 }
2592
2593 // reset handle block
2594 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2595 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2596
2597 // pop our frame
2598
2599 __ leave();
2600
2601 #if INCLUDE_JFR
2602 // We need to do a poll test after unwind in case the sampler
2603 // managed to sample the native frame after returning to Java.
2604 Label L_return;
2605 address poll_test_pc = __ pc();
2606 __ relocate(relocInfo::poll_return_type);
2607 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2608 __ jccb(Assembler::zero, L_return);
2609 __ lea(rscratch1, InternalAddress(poll_test_pc));
2610 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2611 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2612 "polling page return stub not created yet");
2613 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2614 __ jump(RuntimeAddress(stub));
2615 __ bind(L_return);
2616 #endif // INCLUDE_JFR
2617
2618 // Any exception pending?
2619 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2620 __ jcc(Assembler::notEqual, exception_pending);
2621
2622 // Return
2623
2624 __ ret(0);
2625
2626 // Unexpected paths are out of line and go here
2627
2628 // forward the exception
2629 __ bind(exception_pending);
2630
2631 // and forward the exception
2632 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2633
2634 // Slow path locking & unlocking
2635 if (method->is_synchronized()) {
2636
2637 // BEGIN Slow path lock
2638 __ bind(slow_path_lock);
2639
2640 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2641 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2642
2643 // protect the args we've loaded
2644 save_args(masm, total_c_args, c_arg, out_regs);
2645
2646 __ mov(c_rarg0, obj_reg);
2647 __ mov(c_rarg1, lock_reg);
2648 __ mov(c_rarg2, r15_thread);
2649
2650 // Not a leaf but we have last_Java_frame setup as we want.
2651 // We don't want to unmount in case of contention since that would complicate preserving
2652 // the arguments that had already been marshalled into the native convention. So we force
2653 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2654 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2655 __ push_cont_fastpath();
2656 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2657 __ pop_cont_fastpath();
2658 restore_args(masm, total_c_args, c_arg, out_regs);
2659
2660 #ifdef ASSERT
2661 { Label L;
2662 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2663 __ jcc(Assembler::equal, L);
2664 __ stop("no pending exception allowed on exit from monitorenter");
2665 __ bind(L);
2666 }
2667 #endif
2668 __ jmp(lock_done);
2669
2670 // END Slow path lock
2671
2672 // BEGIN Slow path unlock
2673 __ bind(slow_path_unlock);
2674
2675 // If we haven't already saved the native result we must save it now as xmm registers
2676 // are still exposed.
2677 __ vzeroupper();
2678 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2679 save_native_result(masm, ret_type, stack_slots);
2680 }
2681
2682 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2683
2684 __ mov(c_rarg0, obj_reg);
2685 __ mov(c_rarg2, r15_thread);
2686 __ mov(r12, rsp); // remember sp
2687 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2688 __ andptr(rsp, -16); // align stack as required by ABI
2689
2690 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2691 // NOTE that obj_reg == rbx currently
2692 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2693 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2694
2695 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2696 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2697 __ mov(rsp, r12); // restore sp
2698 __ reinit_heapbase();
2699 #ifdef ASSERT
2700 {
2701 Label L;
2702 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2703 __ jcc(Assembler::equal, L);
2704 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2705 __ bind(L);
2706 }
2707 #endif /* ASSERT */
2708
2709 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2710
2711 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2712 restore_native_result(masm, ret_type, stack_slots);
2713 }
2714 __ jmp(unlock_done);
2715
2716 // END Slow path unlock
2717
2718 } // synchronized
2719
2720 // SLOW PATH Reguard the stack if needed
2721
2722 __ bind(reguard);
2723 __ vzeroupper();
2724 save_native_result(masm, ret_type, stack_slots);
2725 __ mov(r12, rsp); // remember sp
2726 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2727 __ andptr(rsp, -16); // align stack as required by ABI
2728 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2729 __ mov(rsp, r12); // restore sp
2730 __ reinit_heapbase();
2731 restore_native_result(masm, ret_type, stack_slots);
2732 // and continue
2733 __ jmp(reguard_done);
2734
2735
2736
2737 __ flush();
2738
2739 nmethod *nm = nmethod::new_native_nmethod(method,
2740 compile_id,
2741 masm->code(),
2742 vep_offset,
2743 frame_complete,
2744 stack_slots / VMRegImpl::slots_per_word,
2745 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2746 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2747 oop_maps);
2748
2749 return nm;
2750 }
2751
2752 // this function returns the adjust size (in number of words) to a c2i adapter
2753 // activation for use during deoptimization
2754 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2755 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2756 }
2757
2758
// amd64 reserves no stack slots on behalf of callees below the outgoing
// argument area.
uint SharedRuntime::out_preserve_stack_slots() {
  return 0;
}
2762
2763
2764 // Number of stack slots between incoming argument block and the start of
2765 // a new frame. The PROLOG must add this many slots to the stack. The
2766 // EPILOG must remove this many slots. amd64 needs two slots for
2767 // return address.
2768 uint SharedRuntime::in_preserve_stack_slots() {
2769 return 4 + 2 * VerifyStackAtCalls;
2770 }
2771
// The register that holds the current JavaThread* in compiled code on
// x86_64 (r15), exposed as a VMReg for platform-independent callers.
VMReg SharedRuntime::thread_register() {
  return r15_thread->as_VMReg();
}
2775
//------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  int pad = 0;
  if (UseAVX > 2) {
    pad += 1024; // extra buffer space for the larger wide-vector save/restore code
  }
  if (UseAPX) {
    pad += 1024; // extra buffer space when APX extended GPRs are saved
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
  // Reuse an AOT-cached blob if one is available rather than regenerating.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
  if (blob != nullptr) {
    _deopt_blob = blob->as_deoptimization_blob();
    return;
  }

  CodeBuffer buffer(name, 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod. A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  // rax: exception oop
  // rbx: exception handler
  // rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt. We save the argument return
  // registers. We call the first C routine, fetch_unroll_info(). This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state. We pop the current frame, then
  // push all the new frames. Then we call the C routine unpack_frames() to
  // populate these frames. Finally unpack_frames() returns us the new target
  // address. Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for non exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    // Use the pc stashed by JVMCI for the implicit exception as this frame's
    // return address, then clear the field.
    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    // Pick up the pending deoptimization request and reset the field to -1
    // (no pending deoptimization).
    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // all registers are dead at this entry point, except for rax, and
  // rdx which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // load throwing pc from JavaThread and patch it as the return address
  // of the current frame. Then clear the field in JavaThread

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  __ bind(cont);

  // Call C code. Need thread and this frame, but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  // r14 := unpack kind (exec mode) recorded in the UnrollBlock
  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers. Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0)); // Save return address
  __ enter(); // Save old & set new ebp
  __ subptr(rsp, rbx); // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp); // Pass sender_sp to next frame
  __ addptr(rsi, wordSize); // Bump array pointer (sizes)
  __ addptr(rcx, wordSize); // Bump array pointer (pcs)
  __ decrementl(rdx); // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0)); // Save final return address

  // Re-push self-frame
  __ enter(); // Save old & set new ebp

  // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two less words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave(); // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif

  // Cache the generated blob for AOT reuse.
  AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
}
3154
//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code. Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Reuse an AOT-cached blob if one is available rather than regenerating.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: poll at a return site, so the return pc already on the
  // stack is valid; otherwise it must be fetched from saved_exception_pc.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00 test %eax,(%rax)
    // 85 01 test %eax,(%rcx)
    // 85 02 test %eax,(%rdx)
    // 85 03 test %eax,(%rbx)
    // 85 06 test %eax,(%rsi)
    // 85 07 test %eax,(%rdi)
    //
    // 41 85 00 test %eax,(%r8)
    // 41 85 01 test %eax,(%r9)
    // 41 85 02 test %eax,(%r10)
    // 41 85 03 test %eax,(%r11)
    // 41 85 06 test %eax,(%r14)
    // 41 85 07 test %eax,(%r15)
    //
    // 85 04 24 test %eax,(%rsp)
    // 41 85 04 24 test %eax,(%r12)
    // 85 45 00 test %eax,0x0(%rbp)
    // 41 85 45 00 test %eax,0x0(%r13)
    //
    // Notes:
    // Format of legacy MAP0 test instruction:-
    // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    // o For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //   operand and base register of memory operand is b/w [0-8), hence we do not require
    //   additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //   is why two bytes encoding is sufficient here.
    // o For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //   register of memory operand is 1000, thus we need additional REX prefix in this case,
    //   there by adding additional byte to instruction encoding.
    // o In case BASE register is one of the 32 extended GPR registers available only on targets
    //   supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //   most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // A two-byte REX2 prefix precedes the opcode when an extended GPR is the base.
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4); // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Cache the generated blob for AOT reuse.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3339
3340 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3342 //
3343 // Generate a stub that calls into vm to find out the proper destination
3344 // of a java call. All the argument registers are live at this point
3345 // but since this is generic code we don't know what they are and the caller
3346 // must do any gc of the args.
3347 //
RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
  assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
  assert(is_resolve_id(id), "expected a resolve stub id");

  const char* name = SharedRuntime::stub_name(id);
  // Reuse an AOT-cached copy of this blob when one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  // allocate space for the code
  ResourceMark rm;
  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The resolver in the VM takes the current thread as its only argument.
  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_metadata(rbx);
  // Stash the Method* and the target entry into the register-save area so
  // restore_live_registers() below reloads them into rbx/rax.
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // Note: the frame size handed to the RuntimeStub is in words, as reported
  // by RegisterSaver::save_live_registers() above.
  RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);

  AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return rs_blob;
}
3433
3434 // Continuation point for throwing of implicit exceptions that are
3435 // not handled in the current activation. Fabricates an exception
3436 // oop and initiates normal exception dispatching in this
3437 // frame. Since we need to preserve callee-saved values (currently
3438 // only for C2, but done for C1 as well) we need a callee-saved oop
3439 // map and therefore have to make these stubs into RuntimeStubs
3440 // rather than BufferBlobs. If the compiler needs all registers to
3441 // be preserved between the fault point and the exception handler
3442 // then it must assume responsibility for that in
3443 // AbstractCompiler::continuation_for_implicit_null_exception or
3444 // continuation_for_implicit_division_by_zero_exception. All other
3445 // implicit exceptions (e.g., NullPointerException or
3446 // AbstractMethodError on entry) are either at call sites or
3447 // otherwise assume that stack unwinding will be initiated, so
3448 // caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Reuse an AOT-cached copy of this stub when one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack

  // Call runtime; the entry takes the current thread as its only argument.
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  // The runtime entry must have installed the exception we are about to forward.
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3539
3540 //------------------------------Montgomery multiplication------------------------
3541 //
3542
3543 #ifndef _WINDOWS
3544
// Subtract 0:b from carry:a. Return carry.
// a and b are little-endian arrays of len 64-bit limbs; a is updated in
// place with the difference, and the return value is the borrow folded
// into the incoming carry word.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // SBB loop: clear the carry flag, then subtract-with-borrow each limb
  // of b from the corresponding limb of a. INC/DEC do not touch CF, so
  // the borrow survives the loop control; the trailing SBB folds the
  // final borrow into the carry word.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3562
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// MUL leaves the 128-bit product in rdx:rax; the ADD/ADC/ADC chain adds
// the low word into T0, the high word into T1, and propagates any final
// carry into T2 (hence the "triple-precision" accumulator).
#define MACC(A, B, T0, T1, T2) \
do { \
  unsigned long hi, lo; \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
           : "r"(A), "a"(B) : "cc"); \
} while(0)

// As above, but add twice the double-length result into the
// accumulator (the add/adc/adc chain is simply emitted twice).
#define MACC2(A, B, T0, T1, T2) \
do { \
  unsigned long hi, lo; \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
           : "r"(A), "a"(B) : "cc"); \
} while(0)
3583
3584 #else //_WINDOWS
3585
3586 static julong
3587 sub(julong a[], julong b[], julong carry, long len) {
3588 long i;
3589 julong tmp;
3590 unsigned char c = 1;
3591 for (i = 0; i < len; i++) {
3592 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3593 a[i] = tmp;
3594 }
3595 c = _addcarry_u64(c, carry, ~0, &tmp);
3596 return tmp;
3597 }
3598
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// Windows build: expressed with the _umul128/_addcarry_u64 intrinsics
// rather than inline asm. The low product word is added into T0, the
// high word into T1, and the final carry propagates into T2.
#define MACC(A, B, T0, T1, T2) \
do { \
  julong hi, lo; \
  lo = _umul128(A, B, &hi); \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1); \
  _addcarry_u64(c, T2, 0, &T2); \
} while(0)

// As above, but add twice the double-length result into the
// accumulator (the addcarry chain is emitted twice).
#define MACC2(A, B, T0, T1, T2) \
do { \
  julong hi, lo; \
  lo = _umul128(A, B, &hi); \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1); \
  _addcarry_u64(c, T2, 0, &T2); \
  c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1); \
  _addcarry_u64(c, T2, 0, &T2); \
} while(0)
3623
3624 #endif //_WINDOWS
3625
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
// Operates on little-endian arrays of len 64-bit limbs; inv must satisfy
// inv * n[0] == -1 mod 2^64 (checked by the assert below).

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // First pass: accumulate products column by column, choosing each limb
  // m[i] = t0 * inv so that adding m[i]*n[0] zeroes the low accumulator
  // word (verified by the assert inside the loop).
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right by one limb.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: finish the upper columns, writing the result limbs
  // into m as each column completes.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // While a carry remains, conditionally subtract n to reduce the result.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3666
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
// Same limb layout and inv precondition as montgomery_multiply above.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // First pass: each off-diagonal product a[j]*a[i-j] appears twice in a
  // square, so it is accumulated with MACC2; the diagonal term (present
  // only in even-numbered columns) is added once with MACC.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    // Remaining reduction products for this column.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator right by one limb.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: finish the upper columns, writing the result limbs
  // into m as each column completes.
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // While a carry remains, conditionally subtract n to reduce the result.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3722
3723 // Swap words in a longword.
3724 static julong swap(julong x) {
3725 return (x << 32) | (x >> 32);
3726 }
3727
3728 // Copy len longwords from s to d, word-swapping as we go. The
3729 // destination array is reversed.
3730 static void reverse_words(julong *s, julong *d, int len) {
3731 d += len;
3732 while(len-- > 0) {
3733 d--;
3734 *d = swap(*s);
3735 s++;
3736 }
3737 }
3738
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
// Compared against the jint (32-bit word) length of the input array.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3742
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  // Reverse the limb order (and swap 32-bit halves) into the layout the
  // core Montgomery routine expects.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  // Convert the result back to the caller's word order.
  reverse_words(m, (julong *)m_ints, longwords);
}
3772
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  // Reverse the limb order (and swap 32-bit halves) into the layout the
  // core Montgomery routines expect.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  // Dedicated squaring only wins above a size threshold; below it the
  // plain multiply (with both factors equal) is used instead.
  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  // Convert the result back to the caller's word order.
  reverse_words(m, (julong *)m_ints, longwords);
}
3804
BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
  BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
  if (buf == nullptr) {
    return nullptr;
  }
  CodeBuffer buffer(buf);
  short buffer_locs[20];
  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
                                         sizeof(buffer_locs)/sizeof(relocInfo));

  MacroAssembler* masm = new MacroAssembler(&buffer);

  const Array<SigEntry>* sig_vk = vk->extended_sig();
  const Array<VMRegPair>* regs = vk->return_regs();

  int pack_fields_jobject_off = __ offset();
  // Resolve pre-allocated buffer from JNI handle.
  // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
  __ movptr(rax, Address(r13, 0));
  __ resolve_jobject(rax /* value */,
                     r12 /* tmp */);
  __ movptr(Address(r13, 0), rax);

  int pack_fields_off = __ offset();

  // Pack entry: store each scalarized field value from its return
  // register into the buffered object that rax points to.
  int j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      // T_VOID marks the second half of a preceding T_LONG/T_DOUBLE entry.
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    Address to(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(to, r_1->as_XMMRegister());
    } else if (bt == T_DOUBLE) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      Register val = r_1->as_Register();
      assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
      if (is_reference_type(bt)) {
        // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
        __ mov(rbx, rax);
        Address to_with_rbx(rbx, off);
        __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
      }
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");
  if (vk->supports_nullable_layouts()) {
    // Set the null marker
    __ movb(Address(rax, vk->null_marker_offset()), 1);
  }
  __ ret(0);

  int unpack_fields_off = __ offset();

  Label skip;
  Label not_null;
  // rax holds the buffered object; a null buffer means a null value.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, not_null);

  // Return value is null. Zero all registers because the runtime requires a canonical
  // representation of a flat null.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }

    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    if (r_1->is_XMMRegister()) {
      __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
    } else {
      __ xorl(r_1->as_Register(), r_1->as_Register());
    }
    j++;
  }
  __ jmp(skip);
  __ bind(not_null);

  // Unpack entry: load each field of the buffered object into its
  // scalarized return register.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      // rax must stay intact while fields remain to be loaded.
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      size_t size_in_bytes = type2aelembytes(bt);
      // Sub-int values are widened; T_CHAR and T_BOOLEAN are zero-extended,
      // the other primitives sign-extended.
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}
3951
3952 #if INCLUDE_JFR
3953
3954 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3955 // It returns a jobject handle to the event writer.
3956 // The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Frame layout: saved rbp plus the return address, two 32-bit slots
  // each; framesize is inclusive of the return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Publish a walkable last Java frame across the VM call.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4000
4001 // For c2: call to return a leased buffer.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Frame layout: saved rbp plus the return address, two 32-bit slots
  // each; framesize is inclusive of the return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Publish a walkable last Java frame across the VM call.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4042
4043 #endif // INCLUDE_JFR